diff --git a/CHANGELOG.md b/CHANGELOG.md
index c99f975d..3a6730f1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,6 @@ New entries are automatically added by [release-please](https://github.com/googl
## [3.2.1](https://github.com/FZJ-IEK3-VSA/tsam/compare/v3.2.0...v3.2.1) (2026-03-25)
-
### Bug Fixes
* use column weights in tuning RMSE objective ([#227](https://github.com/FZJ-IEK3-VSA/tsam/issues/227)) ([1ceee5c](https://github.com/FZJ-IEK3-VSA/tsam/commit/1ceee5c69856b61aed9eae3f5d5f713be8ac85e9)), closes [#226](https://github.com/FZJ-IEK3-VSA/tsam/issues/226)
@@ -83,7 +82,7 @@ See the [migration guide](migration-guide.md) for a complete guide on upgrading
- `cluster_representatives`: DataFrame with aggregated typical periods
- `cluster_assignments`: Which cluster each original period belongs to
- - `cluster_weights`: Occurrence count per cluster
+ - `cluster_counts`: Occurrence count per cluster (fractional for partial periods)
- `accuracy`: `AccuracyMetrics` object with RMSE, MAE, and duration curve RMSE
- `reconstructed`: Reconstructed time series (cached property)
- `residuals`: Difference between original and reconstructed
diff --git a/docs/api/SUMMARY.md b/docs/api/SUMMARY.md
deleted file mode 100644
index c7862239..00000000
--- a/docs/api/SUMMARY.md
+++ /dev/null
@@ -1,17 +0,0 @@
-* tsam
- * [api](tsam/api.md)
- * [config](tsam/config.md)
- * [exceptions](tsam/exceptions.md)
- * [hyperparametertuning](tsam/hyperparametertuning.md)
- * [periodAggregation](tsam/periodAggregation.md)
- * [plot](tsam/plot.md)
- * [representations](tsam/representations.md)
- * [result](tsam/result.md)
- * [timeseriesaggregation](tsam/timeseriesaggregation.md)
- * [tuning](tsam/tuning.md)
- * utils
- * [durationRepresentation](tsam/utils/durationRepresentation.md)
- * [k_maxoids](tsam/utils/k_maxoids.md)
- * [k_medoids_contiguity](tsam/utils/k_medoids_contiguity.md)
- * [k_medoids_exact](tsam/utils/k_medoids_exact.md)
- * [segmentation](tsam/utils/segmentation.md)
diff --git a/docs/api/tsam/api.md b/docs/api/tsam/api.md
deleted file mode 100644
index 9e42f132..00000000
--- a/docs/api/tsam/api.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.api
-
-::: tsam.api
diff --git a/docs/api/tsam/config.md b/docs/api/tsam/config.md
deleted file mode 100644
index 574b0d53..00000000
--- a/docs/api/tsam/config.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.config
-
-::: tsam.config
diff --git a/docs/api/tsam/exceptions.md b/docs/api/tsam/exceptions.md
deleted file mode 100644
index ad986465..00000000
--- a/docs/api/tsam/exceptions.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.exceptions
-
-::: tsam.exceptions
diff --git a/docs/api/tsam/hyperparametertuning.md b/docs/api/tsam/hyperparametertuning.md
deleted file mode 100644
index 26b3f635..00000000
--- a/docs/api/tsam/hyperparametertuning.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.hyperparametertuning
-
-::: tsam.hyperparametertuning
diff --git a/docs/api/tsam/periodAggregation.md b/docs/api/tsam/periodAggregation.md
deleted file mode 100644
index 3cf61ac2..00000000
--- a/docs/api/tsam/periodAggregation.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.periodAggregation
-
-::: tsam.periodAggregation
diff --git a/docs/api/tsam/plot.md b/docs/api/tsam/plot.md
deleted file mode 100644
index e39e480a..00000000
--- a/docs/api/tsam/plot.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.plot
-
-::: tsam.plot
diff --git a/docs/api/tsam/representations.md b/docs/api/tsam/representations.md
deleted file mode 100644
index 5defc12a..00000000
--- a/docs/api/tsam/representations.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.representations
-
-::: tsam.representations
diff --git a/docs/api/tsam/result.md b/docs/api/tsam/result.md
deleted file mode 100644
index 7db18384..00000000
--- a/docs/api/tsam/result.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.result
-
-::: tsam.result
diff --git a/docs/api/tsam/timeseriesaggregation.md b/docs/api/tsam/timeseriesaggregation.md
deleted file mode 100644
index 3aa0633f..00000000
--- a/docs/api/tsam/timeseriesaggregation.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.timeseriesaggregation
-
-::: tsam.timeseriesaggregation
diff --git a/docs/api/tsam/tuning.md b/docs/api/tsam/tuning.md
deleted file mode 100644
index 1819101d..00000000
--- a/docs/api/tsam/tuning.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.tuning
-
-::: tsam.tuning
diff --git a/docs/api/tsam/utils/durationRepresentation.md b/docs/api/tsam/utils/durationRepresentation.md
deleted file mode 100644
index ccfe57d9..00000000
--- a/docs/api/tsam/utils/durationRepresentation.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.utils.durationRepresentation
-
-::: tsam.utils.durationRepresentation
diff --git a/docs/api/tsam/utils/k_maxoids.md b/docs/api/tsam/utils/k_maxoids.md
deleted file mode 100644
index b7482499..00000000
--- a/docs/api/tsam/utils/k_maxoids.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.utils.k_maxoids
-
-::: tsam.utils.k_maxoids
diff --git a/docs/api/tsam/utils/k_medoids_contiguity.md b/docs/api/tsam/utils/k_medoids_contiguity.md
deleted file mode 100644
index 23468303..00000000
--- a/docs/api/tsam/utils/k_medoids_contiguity.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.utils.k_medoids_contiguity
-
-::: tsam.utils.k_medoids_contiguity
diff --git a/docs/api/tsam/utils/k_medoids_exact.md b/docs/api/tsam/utils/k_medoids_exact.md
deleted file mode 100644
index 491e4be8..00000000
--- a/docs/api/tsam/utils/k_medoids_exact.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.utils.k_medoids_exact
-
-::: tsam.utils.k_medoids_exact
diff --git a/docs/api/tsam/utils/segmentation.md b/docs/api/tsam/utils/segmentation.md
deleted file mode 100644
index 8f9bcf77..00000000
--- a/docs/api/tsam/utils/segmentation.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# tsam.utils.segmentation
-
-::: tsam.utils.segmentation
diff --git a/docs/background/pipeline_guide.md b/docs/background/pipeline_guide.md
new file mode 100644
index 00000000..b27c3f3a
--- /dev/null
+++ b/docs/background/pipeline_guide.md
@@ -0,0 +1,671 @@
+# Pipeline Guide
+
+This guide walks through the tsam aggregation pipeline from start to finish.
+It is written for two audiences:
+
+- **Users** who want to understand what happens when they call `tsam.aggregate()`
+ and how configuration choices affect the result.
+- **Developers** who need to modify or extend the pipeline code.
+
+Each section explains one pipeline step: *what* it does, *why* it exists,
+and *where* the code lives.
+
+---
+
+## Overview
+
+A call to `tsam.aggregate()` produces an `AggregationResult` by running
+16 sequential steps. The diagram below shows the high-level flow:
+
+```
+ data (DataFrame)
+ │
+ ▼
+ ┌─────────────┐
+ │ Normalize │ Step 1 — scale to [0,1], column-mean normalization (no weights)
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Unstack │ Step 2 — reshape flat time series into period × timestep matrix
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Weight │ Step 2b — apply per-column weights to a copy for clustering distance
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Augment │ Step 3 — optionally append period-sum features (to weighted copy)
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Cluster │ Step 4 — group similar periods using weighted candidates
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Trim │ Step 5 — remove augmented features from cluster centers
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Extremes │ Step 6 — optionally add/replace extreme-value periods
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Weights │ Step 7 — count how many original periods each cluster represents
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Rescale │ Step 8 — adjust representatives so column means match the original
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Partial │ Step 9 — adjust weight for the last period if the series doesn't divide evenly
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Format │ Step 10 — reshape flat vectors back into a MultiIndex DataFrame
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Segment │ Step 11 — optionally reduce intra-period resolution
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Denormalize│ Step 12 — invert normalization back to original units
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Bounds │ Step 13 — warn if aggregated values exceed original min/max
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Reconstruct│ Step 14 — expand typical periods back to full time series & measure accuracy
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Metadata │ Step 15 — assemble ClusteringResult for serialization/transfer
+ └──────┬───────┘
+ ▼
+ ┌─────────────┐
+ │ Return │ Step 16 — pack everything into PipelineResult → AggregationResult
+ └─────────────┘
+```
+
+---
+
+## Entry points
+
+There are two ways into the pipeline. Both end up calling `run_pipeline()`.
+
+### `tsam.aggregate()` — the primary API
+
+```python
+import tsam
+from tsam import ClusterConfig, SegmentConfig, ExtremeConfig
+
+result = tsam.aggregate(
+ data, # DataFrame with DatetimeIndex
+ n_clusters=8, # how many typical periods
+ period_duration=24, # hours (or '1d', '24h')
+ temporal_resolution=1.0, # hours (or '1h', '15min'); auto-inferred if omitted
+ cluster=ClusterConfig(), # clustering options
+ segments=SegmentConfig(n_segments=8), # optional intra-period segmentation
+ extremes=ExtremeConfig(max_value=["demand"]), # optional extremes
+ weights={"demand": 2.0}, # per-column clustering weights (optional)
+ preserve_column_means=True, # rescale so totals match
+ rescale_exclude_columns=None, # columns to skip during rescaling
+ round_decimals=None, # optional output rounding
+ numerical_tolerance=1e-13, # tolerance for bounds-check warnings
+)
+```
+
+`aggregate()` validates inputs, computes `n_timesteps_per_period` from
+`period_duration / temporal_resolution`, and calls `run_pipeline()`.
+
+### `ClusteringResult.apply()` — transfer to new data
+
+```python
+# Cluster one dataset, apply the same structure to another
+result1 = tsam.aggregate(df_wind, n_clusters=8)
+result2 = result1.clustering.apply(df_all)
+```
+
+`apply()` reconstructs a `PredefParams` object from the stored clustering
+assignments and calls `run_pipeline()` with those predefined assignments
+instead of running clustering from scratch.
+
+---
+
+## Step-by-step walkthrough
+
+### Step 1: Normalize
+
+| | |
+|---|---|
+| **Module** | `pipeline/normalize.py` |
+| **Function** | `normalize()` |
+| **Config** | `ClusterConfig.scale_by_column_means` |
+| **Output** | `NormalizedData` |
+
+This step prepares the raw data for clustering by removing scale
+differences between columns.
+
+**What happens:**
+
+1. **Sort columns** alphabetically (deterministic column order).
+2. **Cast to float** (in case of integer columns).
+3. **Min-max scale** each column to [0, 1] using scikit-learn's
+ `MinMaxScaler`. The fitted scaler is stored for later inversion.
+4. **Column-mean normalization** (optional, `scale_by_column_means=True`):
+ divide each column by its mean so all columns have equal weight
+ regardless of their typical magnitude. Useful when columns have very
+ different average levels.
+
+**Weights are NOT applied here.** Per-column weights (`ClusterConfig.weights`)
+are applied in step 2b to a separate copy used only for clustering distance.
+The `NormalizedData` produced here contains unweighted normalized values that
+flow through all downstream steps (rescaling, denormalization, reconstruction,
+accuracy) without any weight compensation.
+
+**Why it matters:** Without normalization, columns with larger numeric ranges
+dominate the clustering distance. A temperature column ranging 0–40 would
+overshadow a solar capacity factor ranging 0–1.
+
+**Developer note:** The `NormalizedData` object is the most widely used
+intermediate — it is read by nearly every subsequent step.
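+
+A minimal sketch of the two transforms, assuming hypothetical helper names
+(`normalize_sketch` is illustrative, not the actual `normalize()` code):
+
+```python
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+
+def normalize_sketch(data: pd.DataFrame, scale_by_column_means: bool = False):
+    """Illustrative only: min-max scale, then optionally equalize column means."""
+    data = data.sort_index(axis=1).astype(float)  # steps 1-2: sort columns, cast
+    scaler = MinMaxScaler()
+    scaled = pd.DataFrame(
+        scaler.fit_transform(data), index=data.index, columns=data.columns
+    )
+    means = scaled.mean()  # assumes nonzero column means
+    if scale_by_column_means:
+        scaled = scaled / means  # every column now has mean 1.0
+    return scaled, scaler, means  # scaler and means are reused in step 12
+```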
+
+---
+
+### Step 2: Unstack to periods
+
+| | |
+|---|---|
+| **Module** | `pipeline/periods.py` |
+| **Function** | `unstack_to_periods()` |
+| **Output** | `PeriodProfiles` |
+
+Reshapes the flat time series into a matrix where each row is one period
+and each column is `(attribute, timestep)`.
+
+**Example:** With 365 days of hourly data for 3 columns, the input is a
+(8760, 3) DataFrame. After unstacking with `n_timesteps_per_period=24`,
+the profiles matrix is (365, 72) — each row is a 72-dimensional point
+(3 columns × 24 hours).
+
+If the time series length is not evenly divisible by the period length,
+the last period is padded by repeating initial rows.
+
+---
+
+### Step 2b: Apply column weights (optional)
+
+| | |
+|---|---|
+| **Module** | `pipeline/__init__.py` |
+| **Function** | `_build_weight_vector()` |
+| **Config** | `weights` (top-level parameter) |
+
+If `weights` is provided, weights are baked directly into the
+candidates array via vectorized multiply (`np.repeat` + broadcast). The
+`weight_vector` (`np.ndarray`) is stored on `PreparedData` for later
+unweighting.
+
+This means:
+- Weights influence *which* periods get grouped together (clustering distance),
+ *which* period is chosen as representative (medoid/maxoid selection), and
+ `new_cluster` extreme reassignment distances (step 6).
+- After extremes, all representatives are unweighted (step 6b) before
+ downstream steps (rescale, denormalization) which expect unweighted data.
+
+If no weights are provided, candidates pass through unchanged.
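+
+Conceptually, the weighting looks like this (a sketch; the real
+`_build_weight_vector()` may differ in details):
+
+```python
+import numpy as np
+
+# Hypothetical setup: 3 columns x 24 timesteps, demand weighted 2x
+columns = ["demand", "solar", "wind"]
+weights = {"demand": 2.0}
+n_timesteps = 24
+
+# One weight per (attribute, timestep) feature of the candidates matrix
+weight_vector = np.repeat([weights.get(c, 1.0) for c in columns], n_timesteps)
+
+candidates = np.random.rand(365, len(columns) * n_timesteps)  # (periods, features)
+weighted_candidates = candidates * weight_vector  # broadcast across rows
+# step 6b later divides representatives by weight_vector to unweight them
+```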
+
+---
+
+### Step 3: Add period-sum features (optional)
+
+| | |
+|---|---|
+| **Module** | `pipeline/periods.py` |
+| **Function** | `add_period_sum_features()` |
+| **Config** | `ClusterConfig.include_period_sums` |
+
+Appends the per-column sum of each period as extra features. When weights
+are active, the sums are appended to the **weighted** candidates (they are
+clustering features). When no weights are active, they are appended to the
+regular candidates. Either way, the extra columns are removed from the
+cluster centers in step 5 — they only influence which periods get grouped
+together.
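+
+A sketch of the augmentation and the later trim (shapes are hypothetical):
+
+```python
+import numpy as np
+
+n_periods, n_columns, n_timesteps = 365, 3, 24
+candidates = np.random.rand(n_periods, n_columns * n_timesteps)
+
+# Per-period, per-column sums -> (n_periods, n_columns) extra features
+period_sums = candidates.reshape(n_periods, n_columns, n_timesteps).sum(axis=2)
+augmented = np.hstack([candidates, period_sums])  # clustering sees the sums
+
+# Step 5 slices the original features back out of each cluster center
+trimmed = augmented[:, : n_columns * n_timesteps]
+```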
+
+---
+
+### Step 4: Cluster
+
+| | |
+|---|---|
+| **Module** | `pipeline/clustering.py` |
+| **Config** | `ClusterConfig.method`, `.representation`, `.solver`, `.use_duration_curves` |
+| **Output** | `cluster_centers`, `cluster_center_indices`, `cluster_order` |
+
+This is the core step. It groups the period profiles into `n_clusters`
+clusters and selects or computes a representative for each.
+
+Candidates are already weighted (from step 2b). Representatives are
+computed from weighted candidates; unweighting happens later (step 6b).
+The result: cluster assignments reflect weighted
+importance, but typical-period values are in the original normalized space.
+
+**Clustering methods** (`ClusterConfig.method`):
+
+| Method | Description |
+|---|---|
+| `"hierarchical"` | Agglomerative (Ward linkage). Default. Deterministic. |
+| `"kmeans"` | K-means. Fast but non-deterministic (set random seed externally). |
+| `"kmedoids"` | Exact k-medoids via MILP. Slow but optimal. |
+| `"kmaxoids"` | K-maxoids heuristic. |
+| `"averaging"` | Simple period averaging (1 cluster = mean of all). |
+| `"contiguous"` | Adjacent periods only (preserves temporal order). |
+
+**Representation methods** (`ClusterConfig.representation`):
+
+After clustering, each cluster needs a representative period. The choice
+controls what the typical period looks like:
+
+| Representation | Description |
+|---|---|
+| `"mean"` | Arithmetic mean of cluster members. |
+| `"medoid"` | The real period closest to the cluster center. Default. |
+| `"maxoid"` | The real period farthest from the center. |
+| `"distribution"` | Duration-curve fit: sorts values to preserve the statistical distribution. |
+| `"distribution_minmax"` | Like `"distribution"` but also preserves extreme values. |
+| `"minmax_mean"` | Separate min/max/mean per column. |
+| `Distribution(...)` | Fine-grained control over distribution representation. |
+| `MinMaxMean(...)` | Fine-grained control over which columns get min/max treatment. |
+
+**Duration-curve clustering** (`use_duration_curves=True`): Sorts each
+period's values before clustering, so periods are grouped by value
+distribution rather than temporal shape. Useful when the ordering within
+a period doesn't matter (e.g., energy storage optimization).
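+
+A sketch of the transform (sorting within each period, per column; the sort
+direction shown here is an assumption, not a documented detail):
+
+```python
+import numpy as np
+
+n_periods, n_columns, n_timesteps = 365, 3, 24
+candidates = np.random.rand(n_periods, n_columns * n_timesteps)
+
+shaped = candidates.reshape(n_periods, n_columns, n_timesteps)
+duration_curves = -np.sort(-shaped, axis=2)  # descending sort per (period, column)
+sorted_candidates = duration_curves.reshape(n_periods, -1)
+# clustering now compares value distributions, not temporal shapes
+```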
+
+**Transfer path:** When `predef` is provided (via `ClusteringResult.apply()`),
+clustering is skipped entirely. The stored assignments and centers are
+reused as-is.
+
+---
+
+### Step 5: Trim augmented features
+
+Inline in `_cluster_and_postprocess()`.
+
+If period-sum features were added in step 3, the extra columns are
+stripped from each cluster center vector, restoring the original
+dimensionality. Representatives are still weighted at this point.
+
+---
+
+### Step 6: Add extreme periods (optional)
+
+| | |
+|---|---|
+| **Module** | `pipeline/extremes.py` |
+| **Function** | `add_extreme_periods()` |
+| **Config** | `ExtremeConfig` |
+
+This step ensures that periods with extreme values (peak demand, minimum
+solar, etc.) are explicitly represented in the output rather than averaged
+away by clustering.
+
+**Extreme types:**
+
+| Config field | What it preserves |
+|---|---|
+| `max_value=["demand"]` | The period containing the single highest demand value. |
+| `min_value=["solar"]` | The period containing the single lowest solar value. |
+| `max_period=["demand"]` | The period with the highest average demand. |
+| `min_period=["solar"]` | The period with the lowest average solar. |
+
+**Methods** (`ExtremeConfig.method`):
+
+| Method | Behavior |
+|---|---|
+| `"append"` | Adds extreme periods as new clusters (increases `n_clusters`). Default. |
+| `"new_cluster"` | Like append, but also reassigns nearby periods to the new cluster. |
+| `"replace"` | Overwrites the relevant column values in the nearest existing cluster center. |
+
+Extremes run in **weighted space** (matching develop's behavior): when
+weights are active, `profiles_dataframe` is weighted before being passed
+to `add_extreme_periods()`. Extreme detection itself (per-column
+idxmax/idxmin) is weight-invariant, but the `new_cluster` method's
+distance-based reassignment respects weights. Extracted extreme profiles
+carry weights, which are removed uniformly in step 6b.
+
+### Step 6b: Unweight all representatives
+
+Inline in `_cluster_and_postprocess()`.
+
+Divides weights back out of all representatives (regular + extreme) using
+the stored `weight_vector`. After this point, all data is in unweighted
+normalized space for rescale, denormalization, and reconstruction.
+---
+
+### Step 7: Compute cluster weights
+
+Inline in `run_pipeline()`.
+
+Counts how many original periods are assigned to each cluster. The result
+is a dictionary like `{0: 45, 1: 52, 2: 38, ...}`. These weights are
+used for:
+
+- Rescaling (step 8) — the weighted sum must match the original total.
+- Downstream optimization models — each typical period represents
+ `weight` real periods.
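+
+The counting itself is a one-liner over the assignments (sketch with
+hypothetical values):
+
+```python
+import numpy as np
+
+cluster_assignments = np.array([0, 2, 1, 0, 0, 1])  # one entry per original period
+ids, counts = np.unique(cluster_assignments, return_counts=True)
+cluster_counts = dict(zip(ids.tolist(), counts.tolist()))  # {0: 3, 1: 2, 2: 1}
+```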
+
+---
+
+### Step 8: Rescale representatives (optional)
+
+| | |
+|---|---|
+| **Module** | `pipeline/rescale.py` |
+| **Function** | `rescale_representatives()` |
+| **Config** | `preserve_column_means` (= `rescale_cluster_periods`) |
+
+**Problem:** Clustering can shift column means. If you aggregate
+365 daily load profiles into 8 typical days, the weighted average of the
+8 representatives may not match the original annual average.
+
+**Solution:** Iteratively scale each column of each non-extreme cluster
+center until the weighted sum matches the original total (within tolerance).
+Values are clipped to `[0, scale_ub]` where `scale_ub` depends on
+`scale_by_column_means` (ratio of max to mean). Because the data is
+unweighted at this point, no weight compensation is needed for the
+clipping bound.
+
+Extreme clusters (from step 6) are excluded from rescaling to preserve
+their extreme values.
+
+Columns listed in `rescale_exclude_columns` are also skipped — useful for
+binary columns (0/1) that shouldn't be scaled.
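+
+The core of the iteration looks roughly like this (a sketch for a single
+column; all names are hypothetical and convergence details are simplified):
+
+```python
+import numpy as np
+
+def rescale_column_sketch(centers, counts, original_total, scale_ub,
+                          tol=1e-6, max_iter=20):
+    """centers: (n_clusters, n_timesteps) for one column; counts: (n_clusters,)."""
+    for _ in range(max_iter):
+        weighted_sum = float((centers.sum(axis=1) * counts).sum())
+        if weighted_sum == 0 or abs(weighted_sum - original_total) <= tol:
+            break
+        # multiplicative correction, then clip; clipping can undo part of
+        # the correction, which is why the loop iterates
+        centers = np.clip(centers * original_total / weighted_sum, 0.0, scale_ub)
+    return centers
+```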
+
+---
+
+### Step 9: Adjust for partial periods
+
+Inline in `run_pipeline()`.
+
+If the time series doesn't divide evenly into periods (e.g., 8761 hours
+with 24-hour periods), the last period is padded in step 2. Here, its
+cluster weight is reduced proportionally so the total weight is correct.
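+
+For example (values hypothetical):
+
+```python
+# 8761 hours with 24-hour periods -> the padded last period covers 1 real hour
+n_timesteps_total, n_per_period = 8761, 24
+fraction = (n_timesteps_total % n_per_period) / n_per_period  # ~0.042
+
+last_cluster = 3  # whichever cluster the padded period was assigned to
+cluster_counts = {0: 120, 1: 150, 2: 90, 3: 6}
+cluster_counts[last_cluster] -= 1 - fraction  # 6 -> 5.042, a fractional count
+```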
+
+---
+
+### Step 10: Format representatives to DataFrame
+
+| | |
+|---|---|
+| **Function** | `_representatives_to_dataframe()` |
+
+Reshapes the flat 1-D cluster center vectors back into a DataFrame with
+a `(PeriodNum, TimeStep)` MultiIndex. This is the `normalized_typical_periods`
+DataFrame used by subsequent steps.
+
+---
+
+### Step 11: Segment typical periods (optional)
+
+| | |
+|---|---|
+| **Module** | `pipeline/segment.py` |
+| **Config** | `SegmentConfig.n_segments`, `.representation` |
+
+**Problem:** Even after clustering, each typical period still has the
+full temporal resolution (e.g., 24 hourly timesteps). Some optimization
+models need fewer timesteps.
+
+**Solution:** Within each typical period, adjacent timesteps with similar
+values are merged into segments. If `n_segments=8`, each 24-hour period
+is reduced to 8 segments of variable duration.
+
+The segmentation uses the same clustering machinery (constrained
+agglomerative clustering of adjacent timesteps) as the main clustering
+step. The `representation` parameter controls how segment values are
+computed (typically `"mean"`).
+
+After segmentation, the pipeline tracks two DataFrames:
+- `segmented_normalized` — for denormalization (step 12).
+- `predicted_segmented_df` — for reconstruction (step 14).
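+
+A sketch of what segmentation produces for one typical period (the segment
+boundaries here are made up; the real ones come from constrained
+agglomerative clustering of adjacent timesteps):
+
+```python
+import numpy as np
+
+values = np.random.rand(24)  # one column of one typical period
+segment_of_timestep = np.array([0] * 5 + [1] * 3 + [2] * 8 + [3] * 2 + [4] * 6)
+
+segment_ids, durations = np.unique(segment_of_timestep, return_counts=True)
+segment_values = np.array(
+    [values[segment_of_timestep == s].mean() for s in segment_ids]
+)
+# each segment carries (value, duration); durations sum to the 24 timesteps
+```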
+
+---
+
+### Step 12: Denormalize
+
+| | |
+|---|---|
+| **Module** | `pipeline/normalize.py` |
+| **Function** | `denormalize()` |
+
+Inverts the transformations from step 1 to return values in original
+units:
+
+1. Undo column-mean normalization (multiply by stored mean).
+2. Inverse min-max scaling (via the stored `MinMaxScaler`).
+
+No weight removal is needed because weights were never baked into the data.
+
+The output is `typical_periods` — the final representative periods in
+the user's original units.
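+
+A sketch of the inversion, reusing the objects stored in step 1 (names
+hypothetical):
+
+```python
+import pandas as pd
+
+def denormalize_sketch(normalized: pd.DataFrame, scaler, column_means,
+                       scale_by_column_means: bool) -> pd.DataFrame:
+    """Invert step 1: undo the mean division, then the min-max scaling."""
+    values = normalized * column_means if scale_by_column_means else normalized
+    restored = scaler.inverse_transform(values)  # expects step 1's column order
+    return pd.DataFrame(restored, index=values.index, columns=values.columns)
+```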
+
+---
+
+### Step 13: Bounds check
+
+| | |
+|---|---|
+| **Function** | `_warn_if_out_of_bounds()` |
+
+Warns if any column's max (or min) in the typical periods exceeds the
+original data's range beyond `numerical_tolerance`. This can happen with
+distribution representations or aggressive rescaling.
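+
+In spirit, the check is a per-column range comparison (a sketch; how the
+tolerance is applied is an assumption):
+
+```python
+import warnings
+import pandas as pd
+
+def warn_if_out_of_bounds_sketch(original: pd.DataFrame,
+                                 typical: pd.DataFrame,
+                                 tol: float = 1e-13) -> None:
+    for col in original.columns:
+        if typical[col].max() > original[col].max() + tol:
+            warnings.warn(f"{col}: aggregated max exceeds original max")
+        if typical[col].min() < original[col].min() - tol:
+            warnings.warn(f"{col}: aggregated min falls below original min")
+```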
+
+---
+
+### Step 14: Reconstruct and compute accuracy
+
+| | |
+|---|---|
+| **Module** | `pipeline/accuracy.py` |
+| **Functions** | `reconstruct()`, `compute_accuracy()` |
+
+**Reconstruct:** Expands the typical periods back into a full-length time
+series by replacing each original period with its assigned cluster
+representative. The result has the same shape as the input data.
+
+**Accuracy:** Compares the reconstruction to the original in normalized
+(unweighted) space. Both are directly comparable — no weight compensation
+needed. Computes per-column:
+
+| Metric | Description |
+|---|---|
+| RMSE | Root mean square error. |
+| MAE | Mean absolute error. |
+| RMSE (duration) | RMSE on sorted (duration-curve) values — measures distribution fit. |
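+
+Both halves fit in a few lines of NumPy (a sketch for a single column;
+assignments and values are hypothetical):
+
+```python
+import numpy as np
+
+n_periods, n_timesteps = 365, 24
+assignments = np.random.randint(0, 8, size=n_periods)
+representatives = np.random.rand(8, n_timesteps)  # normalized typical periods
+original = np.random.rand(n_periods, n_timesteps)
+
+reconstructed = representatives[assignments]  # expand back to (periods, timesteps)
+rmse = np.sqrt(np.mean((original - reconstructed) ** 2))
+mae = np.mean(np.abs(original - reconstructed))
+# duration-curve RMSE: compare sorted values instead of aligned timesteps
+rmse_duration = np.sqrt(np.mean(
+    (np.sort(original, axis=None) - np.sort(reconstructed, axis=None)) ** 2
+))
+```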
+
+---
+
+### Step 15: Build ClusteringResult
+
+| | |
+|---|---|
+| **Function** | `_build_clustering_result()` |
+
+Assembles all clustering metadata into a `ClusteringResult` object.
+This object is serializable (`.to_json()`, `.from_json()`) and supports
+transfer via `.apply(new_data)`.
+
+Key fields stored:
+- `cluster_assignments` — which cluster each original period belongs to.
+- `cluster_centers` — indices of medoid periods (if applicable).
+- `segment_assignments`, `segment_durations` — segmentation structure.
+- Config references for documentation.
+
+---
+
+### Step 16: Assemble PipelineResult
+
+The pipeline restores the original column order (columns are sorted
+alphabetically internally) and packs everything into a `PipelineResult`,
+which `aggregate()` converts to the user-facing `AggregationResult`.
+
+---
+
+## Working with results
+
+### AggregationResult
+
+The object returned by `tsam.aggregate()`:
+
+```python
+result = tsam.aggregate(df, n_clusters=8)
+
+# Core outputs
+result.cluster_representatives # DataFrame (cluster × timestep)
+result.cluster_counts # {cluster_id: count}
+result.cluster_assignments # array of cluster IDs per original period
+
+# Reconstruction
+result.original # original data
+result.reconstructed # reconstructed data
+result.residuals # original - reconstructed
+
+# Accuracy
+result.accuracy.rmse # per-column RMSE
+result.accuracy.mae # per-column MAE
+result.accuracy.rmse_duration # per-column duration-curve RMSE
+result.accuracy.summary # combined DataFrame
+
+# Metadata
+result.n_clusters
+result.n_timesteps_per_period
+result.n_segments # None if no segmentation
+result.clustering_duration # seconds
+
+# Assignments detail
+result.assignments # DataFrame with period_idx, timestep_idx, cluster_idx, [segment_idx]
+
+# Transfer
+result.clustering.apply(new_data) # apply same clustering to different data
+result.clustering.to_json("clustering.json") # save for later
+```
+
+---
+
+## Configuration reference
+
+### ClusterConfig
+
+```python
+from tsam import ClusterConfig
+
+cluster = ClusterConfig(
+ method="hierarchical", # clustering algorithm
+ representation="medoid", # how to compute cluster centers
+ scale_by_column_means=False, # divide by column mean before clustering
+ use_duration_curves=False, # sort values within periods before clustering
+ include_period_sums=False, # add period sums as extra clustering features
+ solver="highs", # MILP solver (for kmedoids only)
+)
+```
+
+### SegmentConfig
+
+```python
+from tsam import SegmentConfig
+
+segments = SegmentConfig(
+ n_segments=8, # number of segments per period
+ representation="mean", # how to compute segment values
+)
+```
+
+### ExtremeConfig
+
+```python
+from tsam import ExtremeConfig
+
+extremes = ExtremeConfig(
+ method="append", # how to integrate extreme periods
+ max_value=["demand"], # preserve peak-value periods
+ min_value=["solar"], # preserve minimum-value periods
+ max_period=["demand"], # preserve highest-average periods
+ min_period=[], # preserve lowest-average periods
+)
+```
+
+---
+
+## Developer reference
+
+??? info "`run_pipeline()` parameter sources"
+
+ | Parameter | Source: `aggregate()` | Source: `apply()` |
+ |---|---|---|
+ | `data` | user input | user input |
+ | `n_clusters` | user input | derived from assignments |
+ | `n_timesteps_per_period` | `period_duration / resolution` | stored value |
+ | `cluster` | user input or `ClusterConfig()` | `ClusterConfig(representation=...)` |
+ | `extremes` | user input or `None` | `None` (handled via `predef`) |
+ | `segments` | user input or `None` | reconstructed from stored fields |
+ | `rescale_cluster_periods` | `preserve_column_means` | stored value |
+ | `rescale_exclude_columns` | user input | stored value |
+ | `predef` | `None` | built from stored assignments |
+
+??? info "Config field consumption map"
+
+ **ClusterConfig:**
+
+ | Field | Step | Function |
+ |---|---|---|
+ | `method` | 4 | `cluster_periods()` |
+ | `representation` | 4, 11 (fallback) | clustering, `segment_typical_periods()` |
+ | `weights` | 2b | vectorized multiply → `weight_vector` |
+    | `scale_by_column_means` | 1 | `normalize()` |
+ | `use_duration_curves` | 4 | branch gate |
+ | `include_period_sums` | 3 | `add_period_sum_features()` |
+ | `solver` | 4 | `cluster_periods()` |
+
+ **ExtremeConfig:** All fields consumed exclusively in step 6 by `add_extreme_periods()`.
+
+ **SegmentConfig:** Both fields consumed exclusively in step 11 by `segment_typical_periods()`.
+
+??? info "Output assembly: PipelineResult → AggregationResult"
+
+ | PipelineResult field | AggregationResult property |
+ |---|---|
+ | `typical_periods` | `cluster_representatives` |
+ | `cluster_counts` | `cluster_counts` |
+ | `original_data` | `.original` |
+ | `reconstructed_data` | `.reconstructed` |
+ | `accuracy_indicators` | `.accuracy` (RMSE, MAE, duration RMSE) |
+ | `clustering_result` | `.clustering` (for transfer/serialization) |
+ | `segmented_df` | `.assignments` (segment_idx column) |
+
+ Derived properties: `n_clusters`, `n_segments`, `cluster_assignments`, `residuals`, `plot`.
+
+## Source file map
+
+For developers navigating the codebase:
+
+| File | Role |
+|---|---|
+| `src/tsam/api.py` | User-facing `aggregate()` function and result builder |
+| `src/tsam/config.py` | `ClusterConfig`, `SegmentConfig`, `ExtremeConfig`, `ClusteringResult` |
+| `src/tsam/result.py` | `AggregationResult`, `AccuracyMetrics` |
+| `src/tsam/pipeline/__init__.py` | `run_pipeline()` — orchestrates all 16 steps |
+| `src/tsam/pipeline/normalize.py` | `normalize()`, `denormalize()` |
+| `src/tsam/pipeline/periods.py` | `unstack_to_periods()`, `add_period_sum_features()` |
+| `src/tsam/pipeline/clustering.py` | `cluster_periods()`, `cluster_sorted_periods()`, `use_predefined_assignments()` |
+| `src/tsam/pipeline/extremes.py` | `add_extreme_periods()` |
+| `src/tsam/pipeline/rescale.py` | `rescale_representatives()` |
+| `src/tsam/pipeline/segment.py` | `segment_typical_periods()` |
+| `src/tsam/pipeline/accuracy.py` | `reconstruct()`, `compute_accuracy()` |
+| `src/tsam/pipeline/types.py` | `PipelineResult`, `PeriodProfiles`, `NormalizedData`, `PredefParams` |
+| `src/tsam/timeseriesaggregation.py` | Legacy monolith (backward compatibility) |
diff --git a/docs/glossary.md b/docs/glossary.md
index d66c43cb..656dc5c5 100644
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -15,6 +15,6 @@ Key concepts used in the ETHOS.TSAM API:
| `period_duration` | Length of each period. Accepts int/float (hours) or pandas Timedelta strings (e.g., `24`, `'24h'`, `'1d'`). |
| `temporal_resolution` | Time resolution of input data. Accepts float (hours) or pandas Timedelta strings (e.g., `1.0`, `'1h'`, `'15min'`). If not provided, inferred from the datetime index. |
| `cluster_assignments` | Array mapping each original period to its cluster index (0 to n_clusters-1). |
-| `cluster_weights` | Dictionary mapping cluster index to occurrence count (how many original periods each cluster represents). |
+| `cluster_counts` | Dictionary mapping cluster index to occurrence count (how many original periods each cluster represents). May be fractional for partial periods. |
| `segment_durations` | Nested tuple with duration (in timesteps) for each segment in each typical period. |
| `cluster_representatives` | MultiIndex DataFrame with aggregated data. Index levels are (cluster, timestep) or (cluster, segment) if segmented. |
diff --git a/docs/migration-guide.md b/docs/migration-guide.md
index df3fd359..c9f94c12 100644
--- a/docs/migration-guide.md
+++ b/docs/migration-guide.md
@@ -1,4 +1,62 @@
-# Migrating from ETHOS.TSAM v2 to v3 { #migration-guide }
+# Migration Guide { #migration-guide }
+
+## Migrating from v3 to v4 { #migration-v3-to-v4 }
+
+tsam v4 is a pipeline rewrite of the internals. The `tsam.aggregate()` API
+and the legacy `TimeSeriesAggregation` class both still work, but there are
+behavioral changes that may affect your results.
+
+### Weight semantics
+
+`weights` (top-level parameter to `aggregate()`) now affects
+**only** the clustering distance calculation. Previously, weights were baked
+into normalized data, which forced rescaling, reconstruction, and accuracy
+computation to compensate. In v4, all those steps operate on unweighted data.
+
+**What changes:**
+
+- Cluster *assignments* are identical (the weighted distance matrix is
+ mathematically equivalent).
+- With medoid or maxoid representation and non-uniform weights, the selected
+ representative may differ because the medoid is now chosen in the unweighted
+ output space.
+- Across all golden regression tests, the **only** affected configuration is
+ `hierarchical_weighted` — everything else is bit-identical.
+
+**Action required:** If you use non-uniform `weights` with medoid or maxoid
+representation, verify that your downstream results are acceptable. For most
+users, this change is invisible.
+
+### Column order (new API only)
+
+`cluster_representatives`, `reconstructed`, and `original` now return
+columns in the same order as the input DataFrame. Previously, columns were
+alphabetically sorted.
+
+The legacy `TimeSeriesAggregation` class preserves alphabetical sorting for
+backward compatibility.
+
+**Action required:** If your code indexes columns by position (e.g.,
+`df.iloc[:, 0]`), verify that the order matches your expectation.
+
+### Renamed property
+
+`AggregationResult.cluster_weights` has been renamed to `cluster_counts`
+to avoid confusion with per-column clustering weights. The old name still
+works but emits a `FutureWarning`.
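+
+For example (`df` being any DataFrame you already aggregate):
+
+```python
+import tsam
+
+result = tsam.aggregate(df, n_clusters=8)
+result.cluster_counts   # new name
+result.cluster_weights  # deprecated alias: same data, emits FutureWarning
+```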
+
+### Internal changes (no action required)
+
+- The pipeline has been decomposed into stateless functions in
+ `src/tsam/pipeline/`. Both `tsam.aggregate()` and
+ `TimeSeriesAggregation` delegate to `run_pipeline()`.
+- All internal identifiers have been renamed from camelCase to snake_case.
+ The legacy class accepts both naming conventions for its constructor
+ parameters.
+
+---
+
+## Migrating from ETHOS.TSAM v2 to v3 { #migration-v2-to-v3 }
ETHOS.TSAM v3 replaces the class-based API with a functional API.
The old `TimeSeriesAggregation` class still works but is deprecated
@@ -237,7 +295,7 @@ object with everything attached.
| `agg.predictOriginalData()` | `result.reconstructed` |
| `agg.accuracyIndicators()` | `result.accuracy.summary` |
| `agg.clusterOrder` | `result.cluster_assignments` |
-| `agg.clusterPeriodNoOccur` | `result.cluster_weights` |
+| `agg.clusterPeriodNoOccur` | `result.cluster_counts` |
| `agg.clusterCenterIndices` | `result.clustering.cluster_centers` |
| `agg.timeSeries` | `result.original` |
| *(no equivalent)* | `result.residuals` |
diff --git a/docs/notebooks/optimization_input.ipynb b/docs/notebooks/optimization_input.ipynb
index 949ee057..ec064920 100644
--- a/docs/notebooks/optimization_input.ipynb
+++ b/docs/notebooks/optimization_input.ipynb
@@ -159,7 +159,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Calculates how the original index is represented by the old index"
+ "Calculates how the original index is represented by the cluster index"
]
},
{
@@ -168,9 +168,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Advanced: Access the internal aggregation object for features not exposed in the public API.\n",
- "# Note: The _aggregation attribute is internal and may change in future versions.\n",
- "index_matching = result._aggregation.indexMatching()\n",
+ "index_matching = result.assignments\n",
"index_matching.head()"
]
},
@@ -191,7 +189,7 @@
" 0, index=index_matching.index, columns=result.period_index\n",
")\n",
"for col in visualization_df.columns:\n",
- " visualization_df.loc[index_matching[\"PeriodNum\"] == col, col] = 1"
+ " visualization_df.loc[index_matching[\"cluster_idx\"] == col, col] = 1"
]
},
{
@@ -221,7 +219,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**i. cluster_weights** - The occurrence count of each typical period for weighting in the objective function.\n",
+ "**i. cluster_counts** - The occurrence count of each typical period for weighting in the objective function.\n",
"\n",
"Note: Period three is only partially evaluated since its appearance at the end of the year exceeds the original time series."
]
@@ -232,7 +230,7 @@
"metadata": {},
"outputs": [],
"source": [
- "result.cluster_weights"
+ "result.cluster_counts"
]
},
{
@@ -241,7 +239,7 @@
"metadata": {},
"outputs": [],
"source": [
- "weights = pd.Series(result.cluster_weights)\n",
+ "weights = pd.Series(result.cluster_counts)\n",
"fig = px.bar(\n",
" x=weights.index,\n",
" y=weights.values,\n",
@@ -255,7 +253,7 @@
"metadata": {},
"source": [
"**ii. Accessing period data by index**\n",
- "
Access aggregated time series values using period and time step indices. This uses internal API methods that may change in future versions."
+ "
Access aggregated time series values using period and time step indices from the cluster_representatives DataFrame."
]
},
{
@@ -264,10 +262,8 @@
"metadata": {},
"outputs": [],
"source": [
- "# Advanced: Access internal dictionary-style data access.\n",
- "# Note: The _aggregation attribute and its methods are internal and may change in future versions.\n",
- "agg = result._aggregation\n",
- "agg.clusterPeriodDict[\"GHI\"][(agg.clusterPeriodIdx[3], agg.stepIdx[12])]"
+ "# Access a specific value: GHI for cluster period 3, timestep 12\n",
+ "result.cluster_representatives.loc[(3, 12), \"GHI\"]"
]
},
{
diff --git a/mkdocs.yml b/mkdocs.yml
index 9a389992..e558d325 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -72,6 +72,7 @@ nav:
- notebooks/building_energy_system.ipynb
- Learn:
- Mathematical Background: background/math.md
+ - Pipeline Guide: background/pipeline_guide.md
- Further Reading: further-reading.md
- API Reference: api/
- About:
diff --git a/src/tsam/__init__.py b/src/tsam/__init__.py
index ac02563e..62b755c4 100644
--- a/src/tsam/__init__.py
+++ b/src/tsam/__init__.py
@@ -34,8 +34,8 @@
The original class-based API is still available:
>>> from tsam.timeseriesaggregation import TimeSeriesAggregation
->>> agg = TimeSeriesAggregation(df, noTypicalPeriods=8)
->>> typical = agg.createTypicalPeriods()
+>>> agg = TimeSeriesAggregation(df, no_typical_periods=8)
+>>> typical = agg.create_typical_periods()
"""
from tsam.api import aggregate, unstack_to_periods
@@ -62,6 +62,7 @@ def __getattr__(name: str):
SegmentConfig,
)
from tsam.exceptions import LegacyAPIWarning
+from tsam.options import options
from tsam.result import AccuracyMetrics, AggregationResult
# Legacy imports for backward compatibility
@@ -84,6 +85,7 @@ def __getattr__(name: str):
"SegmentConfig",
"TimeSeriesAggregation",
"aggregate",
+ "options",
"plot",
"tuning",
"unstackToPeriods", # Legacy alias
diff --git a/src/tsam/api.py b/src/tsam/api.py
index 75a82f20..8f2f3045 100644
--- a/src/tsam/api.py
+++ b/src/tsam/api.py
@@ -2,27 +2,22 @@
from __future__ import annotations
-import re
-import warnings
-from typing import cast
+from typing import TYPE_CHECKING, cast
import pandas as pd
from tsam.config import (
- EXTREME_METHOD_MAPPING,
- METHOD_MAPPING,
- REPRESENTATION_MAPPING,
ClusterConfig,
- ClusteringResult,
- Distribution,
ExtremeConfig,
- MinMaxMean,
- Representation,
SegmentConfig,
)
-from tsam.exceptions import LegacyAPIWarning
-from tsam.result import AccuracyMetrics, AggregationResult
-from tsam.timeseriesaggregation import TimeSeriesAggregation, unstackToPeriods
+from tsam.pipeline import run_pipeline
+from tsam.pipeline.types import PipelineConfig
+from tsam.result import AggregationResult
+from tsam.weights import validate_weights
+
+if TYPE_CHECKING:
+ from tsam.pipeline.types import PipelineResult
def _weighted_mean(
@@ -84,7 +79,7 @@ def _parse_duration_hours(value: int | float | str, param_name: str) -> float:
"""Parse a duration value to hours.
Accepts:
- - int/float: interpreted as hours (e.g., 24 → 24.0 hours)
+ - int/float: interpreted as hours (e.g., 24 -> 24.0 hours)
- str: pandas Timedelta string (e.g., '24h', '1d', '15min')
Returns duration in hours as float.
@@ -93,8 +88,6 @@ def _parse_duration_hours(value: int | float | str, param_name: str) -> float:
return float(value)
if isinstance(value, str):
try:
- # Normalize deprecated lowercase day alias: '1d' → '1D' (pandas 4+)
- value = re.sub(r"(?<=[0-9])d(?![a-z])", "D", value)
td = pd.Timedelta(value)
return td.total_seconds() / 3600
except ValueError as e:
@@ -194,7 +187,7 @@ def aggregate(
Object containing:
- cluster_representatives: DataFrame with aggregated periods
- cluster_assignments: Which cluster each original period belongs to
- - cluster_weights: Occurrence count per cluster
+ - cluster_counts: Occurrence count per cluster
- accuracy: RMSE, MAE metrics
- Methods: to_dict()
@@ -278,26 +271,24 @@ def aggregate(
if cluster is None:
cluster = ClusterConfig()
+ # Compute n_timesteps_per_period
+ if temporal_resolution is not None:
+ resolution = temporal_resolution
+ else:
+ # Infer resolution from data index
+ if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
+ resolution = (data.index[1] - data.index[0]).total_seconds() / 3600
+ else:
+ resolution = 1.0 # Default to hourly
+
+ n_timesteps_per_period = int(period_duration / resolution)
+
# Validate segments against data
if segments is not None:
- # Calculate timesteps per period
- if temporal_resolution is not None:
- timesteps_per_period = int(period_duration / temporal_resolution)
- else:
- # Infer resolution from data index
- if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
- inferred_resolution = (
- data.index[1] - data.index[0]
- ).total_seconds() / 3600
- timesteps_per_period = int(period_duration / inferred_resolution)
- else:
- # Fall back to assuming hourly resolution
- timesteps_per_period = int(period_duration)
-
- if segments.n_segments > timesteps_per_period:
+ if segments.n_segments > n_timesteps_per_period:
raise ValueError(
f"n_segments ({segments.n_segments}) cannot exceed "
- f"timesteps per period ({timesteps_per_period})"
+ f"timesteps per period ({n_timesteps_per_period})"
)
# Validate extreme columns exist in data
@@ -319,50 +310,54 @@ def aggregate(
"Use only the top-level weights parameter."
)
if cluster.weights is not None:
- # Deprecation warning already emitted by ClusterConfig.__post_init__
weights = cluster.weights
- # Validate weight columns exist
- if weights is not None:
- missing = set(weights.keys()) - set(data.columns)
- if missing:
- raise ValueError(f"Weight columns not found in data: {missing}")
+ # Validate and normalize weights
+ validated = validate_weights(data.columns, weights)
+ if validated is not cluster.weights:
+ cluster = ClusterConfig(
+ method=cluster.method,
+ representation=cluster.representation,
+ weights=validated,
+ scale_by_column_means=cluster.scale_by_column_means,
+ use_duration_curves=cluster.use_duration_curves,
+ include_period_sums=cluster.include_period_sums,
+ solver=cluster.solver,
+ )
- # Build old API parameters
- old_params = _build_old_params(
- data=data,
+ # Build pipeline config
+ cfg = PipelineConfig(
n_clusters=n_clusters,
- period_duration=period_duration,
- temporal_resolution=temporal_resolution,
+ n_timesteps_per_period=n_timesteps_per_period,
cluster=cluster,
+ extremes=extremes if extremes and extremes.has_extremes() else None,
segments=segments,
- extremes=extremes,
- weights=weights,
- preserve_column_means=preserve_column_means,
+ rescale_cluster_periods=preserve_column_means,
rescale_exclude_columns=rescale_exclude_columns,
round_decimals=round_decimals,
numerical_tolerance=numerical_tolerance,
+ temporal_resolution=temporal_resolution,
)
- # Run aggregation using old implementation (suppress deprecation warning for internal use)
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", LegacyAPIWarning)
- agg = TimeSeriesAggregation(**old_params)
- cluster_representatives = agg.createTypicalPeriods()
+ result = run_pipeline(data=data, cfg=cfg)
+
+ return _build_aggregation_result(result, is_transferred=False)
- # Rename index levels for consistency with new API terminology
- cluster_representatives = cluster_representatives.rename_axis(
+
+def _build_aggregation_result(
+ result: PipelineResult,
+ is_transferred: bool,
+) -> AggregationResult:
+ """Convert PipelineResult to the user-facing AggregationResult."""
+ # Rename index levels
+ cluster_representatives = result.typical_periods.rename_axis(
index={"PeriodNum": "cluster", "TimeStep": "timestep"}
)
- # Build accuracy metrics
- accuracy_df = agg.accuracyIndicators()
-
# Build rescale deviations DataFrame
- rescale_deviations_dict = getattr(agg, "_rescaleDeviations", {})
- if rescale_deviations_dict:
+ if result.rescale_deviations:
rescale_deviations = pd.DataFrame.from_dict(
- rescale_deviations_dict, orient="index"
+ result.rescale_deviations, orient="index"
)
rescale_deviations.index.name = "column"
else:
@@ -370,307 +365,28 @@ def aggregate(
columns=["deviation_pct", "converged", "iterations"]
)
- accuracy = AccuracyMetrics(
- rmse=accuracy_df["RMSE"],
- mae=accuracy_df["MAE"],
- rmse_duration=accuracy_df["RMSE_duration"],
- rescale_deviations=rescale_deviations,
- weighted_rmse=_weighted_rms(accuracy_df["RMSE"], weights),
- weighted_mae=_weighted_mean(accuracy_df["MAE"], weights),
- weighted_rmse_duration=_weighted_rms(accuracy_df["RMSE_duration"], weights),
- )
-
- # Build ClusteringResult
- clustering_result = _build_clustering_result(
- agg=agg,
- n_segments=segments.n_segments if segments else None,
- cluster_config=cluster,
- segment_config=segments,
- extremes_config=extremes,
- weights=weights,
- preserve_column_means=preserve_column_means,
- rescale_exclude_columns=rescale_exclude_columns,
- temporal_resolution=temporal_resolution,
- )
-
- # Compute segment_durations as tuple of tuples
- segment_durations_tuple = None
- if segments and hasattr(agg, "segmentedNormalizedTypicalPeriods"):
- segmented_df = agg.segmentedNormalizedTypicalPeriods
- segment_durations_tuple = tuple(
- tuple(
- int(seg_dur)
- for _seg_step, seg_dur, _orig_start in segmented_df.loc[
- period_idx
- ].index
- )
- for period_idx in segmented_df.index.get_level_values(0).unique()
- )
+ # Get segment_durations from ClusteringResult
+ segment_durations = result.clustering_result.segment_durations
- # Build result object
return AggregationResult(
cluster_representatives=cluster_representatives,
- cluster_weights=dict(agg.clusterPeriodNoOccur),
- n_timesteps_per_period=agg.timeStepsPerPeriod,
- segment_durations=segment_durations_tuple,
- accuracy=accuracy,
- clustering_duration=getattr(agg, "clusteringDuration", 0.0),
- clustering=clustering_result,
- is_transferred=False,
- _aggregation=agg,
- )
-
-
-def _build_clustering_result(
- agg: TimeSeriesAggregation,
- n_segments: int | None,
- cluster_config: ClusterConfig,
- segment_config: SegmentConfig | None,
- extremes_config: ExtremeConfig | None,
- weights: dict[str, float] | None = None,
- preserve_column_means: bool = True,
- rescale_exclude_columns: list[str] | None = None,
- temporal_resolution: float | None = None,
-) -> ClusteringResult:
- """Build ClusteringResult from a TimeSeriesAggregation object."""
- # Get cluster centers (convert to Python ints for JSON serialization)
- # Handle extreme periods based on method:
- # - new_cluster/append: append extreme period indices (creates additional clusters)
- # - replace: keep original cluster centers
- # Note: replace creates a hybrid representation (some columns from medoid, some
- # from extreme period) that cannot be perfectly reproduced during transfer
- cluster_centers: tuple[int, ...] | None = None
- if agg.clusterCenterIndices is not None:
- center_indices = [int(x) for x in agg.clusterCenterIndices]
-
- if (
- hasattr(agg, "extremePeriods")
- and agg.extremePeriods
- and extremes_config is not None
- and extremes_config.method in ("new_cluster", "append")
- ):
- # Add extreme period indices as new cluster centers
- for period_type in agg.extremePeriods:
- center_indices.append(int(agg.extremePeriods[period_type]["stepNo"]))
-
- cluster_centers = tuple(center_indices)
-
- # Compute segment data if segmentation was used
- segment_assignments: tuple[tuple[int, ...], ...] | None = None
- segment_durations: tuple[tuple[int, ...], ...] | None = None
- segment_centers: tuple[tuple[int, ...], ...] | None = None
-
- if n_segments is not None and hasattr(agg, "segmentedNormalizedTypicalPeriods"):
- segmented_df = agg.segmentedNormalizedTypicalPeriods
- segment_assignments_list = []
- segment_durations_list = []
-
- for period_idx in segmented_df.index.get_level_values(0).unique():
- period_data = segmented_df.loc[period_idx]
- # Index levels: Segment Step, Segment Duration, Original Start Step
- assignments = []
- durations = []
- for seg_step, seg_dur, _orig_start in period_data.index:
- assignments.extend([int(seg_step)] * int(seg_dur))
- durations.append(int(seg_dur))
- segment_assignments_list.append(tuple(assignments))
- segment_durations_list.append(tuple(durations))
-
- segment_assignments = tuple(segment_assignments_list)
- segment_durations = tuple(segment_durations_list)
-
- # Extract segment center indices (only available for medoid/maxoid representations)
- if (
- hasattr(agg, "segmentCenterIndices")
- and agg.segmentCenterIndices is not None
- ):
- # Check if any period has center indices (None for mean representation)
- if all(pc is not None for pc in agg.segmentCenterIndices):
- segment_centers = tuple(
- tuple(int(x) for x in period_centers)
- for period_centers in agg.segmentCenterIndices
- )
-
- # Extract representation from configs
- representation = cluster_config.get_representation()
- segment_representation = segment_config.representation if segment_config else None
-
- # Extract extreme cluster indices if extremes were used
- extreme_cluster_indices: tuple[int, ...] | None = None
- if hasattr(agg, "extremeClusterIdx") and agg.extremeClusterIdx:
- extreme_cluster_indices = tuple(int(x) for x in agg.extremeClusterIdx)
-
- return ClusteringResult(
- period_duration=agg.hoursPerPeriod,
- cluster_assignments=tuple(int(x) for x in agg.clusterOrder),
- cluster_centers=cluster_centers,
- segment_assignments=segment_assignments,
+ cluster_counts=result.cluster_counts,
+ n_timesteps_per_period=result.n_timesteps_per_period,
segment_durations=segment_durations,
- segment_centers=segment_centers,
- preserve_column_means=preserve_column_means,
- rescale_exclude_columns=tuple(rescale_exclude_columns)
- if rescale_exclude_columns
- else None,
- representation=representation,
- segment_representation=segment_representation,
- temporal_resolution=temporal_resolution,
- n_timesteps_per_period=agg.timeStepsPerPeriod,
- extreme_cluster_indices=extreme_cluster_indices,
- weights=weights,
- cluster_config=cluster_config,
- segment_config=segment_config,
- extremes_config=extremes_config,
+ clustering_duration=result.clustering_duration,
+ clustering=result.clustering_result,
+ is_transferred=is_transferred,
+ _original_data=result.original_data,
+ _reconstructed_data=result.reconstructed_data,
+ _time_index=result.time_index,
+ _norm_values=result._norm_values,
+ _normalized_predicted=result._normalized_predicted,
+ _rescale_deviations=rescale_deviations,
+ _segmented_df=result.segmented_df,
+ _weights=result.clustering_result.weights,
)
-def _apply_representation_params(
- params: dict, representation: Representation, columns: list[str]
-) -> None:
- """Apply representation parameters to the old API params dict.
-
- Handles both string shortcuts and typed representation objects
- (Distribution, MinMaxMean).
- """
- if isinstance(representation, Distribution):
- if representation.preserve_minmax:
- params["representationMethod"] = "distributionAndMinMaxRepresentation"
- else:
- params["representationMethod"] = "distributionRepresentation"
- params["distributionPeriodWise"] = representation.scope == "cluster"
- elif isinstance(representation, MinMaxMean):
- params["representationMethod"] = "minmaxmeanRepresentation"
- # Build representationDict: columns not in max/min default to mean
- rep_dict: dict[str, str] = {}
- max_set = set(representation.max_columns)
- min_set = set(representation.min_columns)
- for col in columns:
- if col in max_set:
- rep_dict[col] = "max"
- elif col in min_set:
- rep_dict[col] = "min"
- else:
- rep_dict[col] = "mean"
- params["representationDict"] = rep_dict
- else:
- # String representation
- rep_mapped = REPRESENTATION_MAPPING.get(representation)
- if rep_mapped is None:
- raise ValueError(
- f"Unknown representation method: {representation!r}. "
- f"Valid options: {list(REPRESENTATION_MAPPING.keys())}"
- )
- params["representationMethod"] = rep_mapped
-
-
-def _build_old_params(
- data: pd.DataFrame,
- n_clusters: int,
- period_duration: float,
- temporal_resolution: float | None,
- cluster: ClusterConfig,
- segments: SegmentConfig | None,
- extremes: ExtremeConfig | None,
- preserve_column_means: bool,
- rescale_exclude_columns: list[str] | None,
- round_decimals: int | None,
- numerical_tolerance: float,
- weights: dict[str, float] | None = None,
- *,
- # Predefined parameters (used internally by ClusteringResult.apply())
- predef_cluster_assignments: tuple[int, ...] | None = None,
- predef_cluster_centers: tuple[int, ...] | None = None,
- predef_extreme_cluster_indices: tuple[int, ...] | None = None,
- predef_segment_assignments: tuple[tuple[int, ...], ...] | None = None,
- predef_segment_durations: tuple[tuple[int, ...], ...] | None = None,
- predef_segment_centers: tuple[tuple[int, ...], ...] | None = None,
-) -> dict:
- """Build parameters for the old TimeSeriesAggregation API."""
- params: dict = {
- "timeSeries": data,
- "noTypicalPeriods": n_clusters,
- "hoursPerPeriod": period_duration,
- "rescaleClusterPeriods": preserve_column_means,
- "rescaleExcludeColumns": rescale_exclude_columns,
- "numericalTolerance": numerical_tolerance,
- }
-
- if temporal_resolution is not None:
- params["resolution"] = temporal_resolution
-
- if round_decimals is not None:
- params["roundOutput"] = round_decimals
-
- # Cluster config
- method = METHOD_MAPPING.get(cluster.method)
- if method is None:
- raise ValueError(
- f"Unknown cluster method: {cluster.method!r}. "
- f"Valid options: {list(METHOD_MAPPING.keys())}"
- )
- params["clusterMethod"] = method
-
- representation = cluster.get_representation()
- _apply_representation_params(params, representation, data.columns.tolist())
- params["sortValues"] = cluster.use_duration_curves
- params["sameMean"] = cluster.normalize_column_means
- params["evalSumPeriods"] = cluster.include_period_sums
- params["solver"] = cluster.solver
-
- if weights is not None:
- params["weightDict"] = weights
-
- if predef_cluster_assignments is not None:
- params["predefClusterOrder"] = list(predef_cluster_assignments)
-
- if predef_cluster_centers is not None:
- params["predefClusterCenterIndices"] = list(predef_cluster_centers)
-
- if predef_extreme_cluster_indices is not None:
- params["predefExtremeClusterIdx"] = list(predef_extreme_cluster_indices)
-
- # Segmentation config
- if segments is not None:
- params["segmentation"] = True
- params["noSegments"] = segments.n_segments
- seg_rep = segments.representation
- if isinstance(seg_rep, (Distribution, MinMaxMean)):
- seg_params: dict = {}
- _apply_representation_params(seg_params, seg_rep, data.columns.tolist())
- params["segmentRepresentationMethod"] = seg_params["representationMethod"]
- if "distributionPeriodWise" in seg_params:
- params["distributionPeriodWise"] = seg_params["distributionPeriodWise"]
- if "representationDict" in seg_params:
- params["representationDict"] = seg_params["representationDict"]
- else:
- params["segmentRepresentationMethod"] = REPRESENTATION_MAPPING.get(
- seg_rep, "meanRepresentation"
- )
-
- # Predefined segment parameters (from ClusteringResult)
- if predef_segment_assignments is not None:
- params["predefSegmentOrder"] = [list(s) for s in predef_segment_assignments]
- if predef_segment_durations is not None:
- params["predefSegmentDurations"] = [
- list(s) for s in predef_segment_durations
- ]
- if predef_segment_centers is not None:
- params["predefSegmentCenters"] = [list(s) for s in predef_segment_centers]
- else:
- params["segmentation"] = False
-
- # Extreme config
- if extremes is not None and extremes.has_extremes():
- params["extremePeriodMethod"] = EXTREME_METHOD_MAPPING[extremes.method]
- params["addPeakMax"] = extremes.max_value
- params["addPeakMin"] = extremes.min_value
- params["addMeanMax"] = extremes.max_period
- params["addMeanMin"] = extremes.min_period
- else:
- params["extremePeriodMethod"] = "None"
-
- return params
-
-
def unstack_to_periods(
data: pd.DataFrame,
period_duration: int | float | str = 24,
@@ -725,7 +441,7 @@ def unstack_to_periods(
f"data timestep resolution ({timestep_hours}h)"
)
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", LegacyAPIWarning)
- unstacked, _ = unstackToPeriods(data.copy(), timesteps_per_period)
- return cast("pd.DataFrame", unstacked)
+ from tsam.pipeline.periods import unstack_to_periods as _unstack
+
+ profiles = _unstack(data.copy(), timesteps_per_period)
+ return cast("pd.DataFrame", profiles.profiles_dataframe)
diff --git a/src/tsam/config.py b/src/tsam/config.py
index daf7c60f..c13bc496 100644
--- a/src/tsam/config.py
+++ b/src/tsam/config.py
@@ -12,6 +12,14 @@
if TYPE_CHECKING:
from tsam.result import AggregationResult
+
+def _infer_resolution(data: pd.DataFrame) -> float:
+ """Infer temporal resolution from data index."""
+ if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
+ return (data.index[1] - data.index[0]).total_seconds() / 3600
+ return 1.0
+
+
# Type aliases for clarity
ClusterMethod = Literal[
"averaging",
@@ -114,25 +122,6 @@ def from_dict(cls, data: dict) -> MinMaxMean:
Representation = RepresentationMethod | Distribution | MinMaxMean
-def _resolve_representation(rep: Representation) -> Representation:
- """Normalize a string representation shortcut to an object when needed.
-
- Returns the input unchanged for objects and simple string methods
- (mean, medoid, maxoid). Converts distribution/distribution_minmax/minmax_mean
- strings to their corresponding objects.
- """
- if isinstance(rep, (Distribution, MinMaxMean)):
- return rep
- if rep == "distribution":
- return Distribution()
- if rep == "distribution_minmax":
- return Distribution(preserve_minmax=True)
- if rep == "minmax_mean":
- return MinMaxMean()
- # Simple string methods: mean, medoid, maxoid
- return rep
-
-
def _representation_to_dict(rep: Representation) -> str | dict[str, Any]:
"""Serialize a representation value to a JSON-compatible format."""
if isinstance(rep, (Distribution, MinMaxMean)):
@@ -153,7 +142,6 @@ def _representation_from_dict(data: str | dict) -> Representation:
raise ValueError(f"Unknown representation type: {rep_type!r}")
-@dataclass(frozen=True)
class ClusterConfig:
"""Configuration for the clustering algorithm.
@@ -196,12 +184,37 @@ class ClusterConfig:
weights : dict[str, float], optional
.. deprecated::
Pass ``weights`` as a top-level parameter to
- :func:`~tsam.aggregate` instead. Weights affect all pipeline
- stages, not just clustering.
-
- normalize_column_means : bool, default False
- Normalize all columns to the same mean before clustering.
+ :func:`~tsam.aggregate` instead.
+
+ Per-column importance factors for clustering. Higher weight = more
+ influence. Example: ``{"demand": 2.0, "solar": 1.0}``
+
+ Weights scale the normalized data used for all clustering-related
+ decisions. They affect:
+
+ 1. **Clustering distance**: Columns with higher weight contribute
+ more to the distance metric, so clusters form around patterns
+ in high-weight columns.
+ 2. **Medoid/maxoid selection**: These representations pick an actual
+ period by cross-column distance. Weights change which period is
+ "closest to the centroid" or "farthest from other centroids."
+ 3. **Segmentation**: Segment boundaries within typical periods are
+ determined in weighted space. High-weight columns have more
+ influence on where boundaries fall.
+
+ Other representations (mean, distribution, minmax_mean) are
+ **weight-invariant** — the weight multiplies in and divides back
+ out, producing the same result regardless of weight values.
+
+ Weights are removed before producing final outputs. They do not
+ affect normalization, rescaling, denormalization, reconstruction,
+ or accuracy computation. Columns not listed default to weight 1.0.
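+
+        A sketch of the recommended top-level call (the exact ``aggregate``
+        signature is assumed here and may differ)::
+
+            result = tsam.aggregate(
+                df, n_clusters=8, weights={"demand": 2.0, "solar": 1.0}
+            )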
+
+ scale_by_column_means : bool, default False
+ Divide each column by its mean after MinMax normalization, so all
+ columns have equal mean before clustering.
Useful when columns have very different scales.
+ (Previously called ``normalize_column_means``.)
use_duration_curves : bool, default False
Sort values within each period before clustering.
@@ -216,16 +229,44 @@ class ClusterConfig:
Options: "highs" (default, open source), "cbc", "gurobi", "cplex"
"""
- method: ClusterMethod = "hierarchical"
- representation: Representation | None = None
- weights: dict[str, float] | None = field(default=None, repr=False)
- normalize_column_means: bool = False
- use_duration_curves: bool = False
- include_period_sums: bool = False
- solver: Solver = "highs"
-
- def __post_init__(self) -> None:
- if self.weights is not None:
+ method: ClusterMethod
+ representation: Representation | None
+ weights: dict[str, float] | None
+ scale_by_column_means: bool
+ use_duration_curves: bool
+ include_period_sums: bool
+ solver: Solver
+
+ __slots__ = (
+ "include_period_sums",
+ "method",
+ "representation",
+ "scale_by_column_means",
+ "solver",
+ "use_duration_curves",
+ "weights",
+ )
+
+ def __init__(
+ self,
+ method: ClusterMethod = "hierarchical",
+ representation: Representation | None = None,
+ weights: dict[str, float] | None = None,
+ scale_by_column_means: bool = False,
+ use_duration_curves: bool = False,
+ include_period_sums: bool = False,
+ solver: Solver = "highs",
+ # Backward compat alias
+ normalize_column_means: bool | None = None,
+ ) -> None:
+ if normalize_column_means is not None:
+ warnings.warn(
+ "'normalize_column_means' is deprecated, use 'scale_by_column_means'.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ scale_by_column_means = normalize_column_means
+ if weights is not None:
warnings.warn(
"Passing weights via ClusterConfig is deprecated. "
"Pass weights as a top-level parameter to aggregate() instead, "
@@ -233,6 +274,43 @@ def __post_init__(self) -> None:
DeprecationWarning,
stacklevel=2,
)
+ object.__setattr__(self, "method", method)
+ object.__setattr__(self, "representation", representation)
+ object.__setattr__(self, "weights", weights)
+ object.__setattr__(self, "scale_by_column_means", scale_by_column_means)
+ object.__setattr__(self, "use_duration_curves", use_duration_curves)
+ object.__setattr__(self, "include_period_sums", include_period_sums)
+ object.__setattr__(self, "solver", solver)
+
+ def __setattr__(self, name: str, value: object) -> None:
+ raise AttributeError("ClusterConfig is immutable")
+
+ def __delattr__(self, name: str) -> None:
+ raise AttributeError("ClusterConfig is immutable")
+
+ def __getstate__(self) -> dict:
+ return {s: getattr(self, s) for s in self.__slots__}
+
+ def __setstate__(self, state: dict) -> None:
+ for key, value in state.items():
+ object.__setattr__(self, key, value)
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, ClusterConfig):
+ return NotImplemented
+ return all(getattr(self, s) == getattr(other, s) for s in self.__slots__)
+
+ def __hash__(self) -> int:
+ return hash(tuple(getattr(self, s) for s in self.__slots__))
+
+ def __repr__(self) -> str:
+ parts = ", ".join(f"{s}={getattr(self, s)!r}" for s in self.__slots__)
+ return f"ClusterConfig({parts})"
+
+ @property
+ def normalize_column_means(self) -> bool:
+ """Deprecated alias for ``scale_by_column_means``."""
+ return self.scale_by_column_means
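+
+    # Usage sketch (hypothetical values): the class mimics a frozen
+    # dataclass, so instances compare by value, hash (when all field
+    # values are hashable), and reject mutation:
+    #
+    #     cfg = ClusterConfig(method="kmeans", solver="highs")
+    #     cfg.solver = "gurobi"  # AttributeError: ClusterConfig is immutable
+    #     cfg == ClusterConfig(method="kmeans", solver="highs")  # True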
def get_representation(self) -> Representation:
"""Get the representation, using default if not specified."""
@@ -257,8 +335,8 @@ def to_dict(self) -> dict[str, Any]:
result["representation"] = _representation_to_dict(self.representation)
if self.weights is not None:
result["weights"] = self.weights
- if self.normalize_column_means:
- result["normalize_column_means"] = self.normalize_column_means
+ if self.scale_by_column_means:
+ result["scale_by_column_means"] = self.scale_by_column_means
if self.use_duration_curves:
result["use_duration_curves"] = self.use_duration_curves
if self.include_period_sums:
@@ -278,7 +356,10 @@ def from_dict(cls, data: dict) -> ClusterConfig:
method=data.get("method", "hierarchical"),
representation=representation,
weights=data.get("weights"),
- normalize_column_means=data.get("normalize_column_means", False),
+ scale_by_column_means=data.get(
+ "scale_by_column_means",
+ data.get("normalize_column_means", False),
+ ),
use_duration_curves=data.get("use_duration_curves", False),
include_period_sums=data.get("include_period_sums", False),
solver=data.get("solver", "highs"),
@@ -335,6 +416,16 @@ def from_dict(cls, data: dict) -> SegmentConfig:
)
+def _get_version() -> str:
+ """Get tsam version string for ClusteringResult."""
+ import importlib.metadata
+
+ try:
+ return importlib.metadata.version("tsam")
+ except importlib.metadata.PackageNotFoundError:
+ return "unknown"
+
+
def _validate_disaggregate_input(
data: pd.DataFrame,
clustering: ClusteringResult,
@@ -581,6 +672,9 @@ class ClusteringResult:
segment_config: SegmentConfig | None = None
extremes_config: ExtremeConfig | None = None
+ # === Format version ===
+ version: str | None = None
+
def __post_init__(self) -> None:
if self.segment_assignments is not None and self.segment_durations is None:
raise ValueError(
@@ -595,6 +689,128 @@ def __post_init__(self) -> None:
"segment_assignments must be provided when segment_centers is specified"
)
+ @classmethod
+ def from_pipeline(
+ cls,
+ *,
+ cluster_center_indices: list | None,
+ extreme_periods_info: dict,
+ extremes_config: ExtremeConfig | None,
+ cluster_order: list | np.ndarray,
+ segmented_df: pd.DataFrame | None,
+ segment_center_indices: list | None,
+ n_timesteps_per_period: int,
+ temporal_resolution: float | None,
+ original_data: pd.DataFrame,
+ cluster_config: ClusterConfig,
+ segment_config: SegmentConfig | None,
+ rescale_cluster_periods: bool,
+ rescale_exclude_columns: list[str] | None,
+ extreme_cluster_idx: list[int],
+ ) -> ClusteringResult:
+ """Build a ClusteringResult from pipeline intermediate data."""
+ # Get cluster centers
+ cluster_centers: tuple[int, ...] | None = None
+ if cluster_center_indices is not None:
+ center_indices = [int(x) for x in cluster_center_indices]
+
+ if (
+ extreme_periods_info
+ and extremes_config is not None
+ and extremes_config.method in ("new_cluster", "append")
+ ):
+ for period_type in extreme_periods_info:
+ center_indices.append(
+ int(extreme_periods_info[period_type]["step_no"])
+ )
+
+ cluster_centers = tuple(center_indices)
+
+ # Compute segment data if segmentation was used
+ segment_assignments: tuple[tuple[int, ...], ...] | None = None
+ segment_durations: tuple[tuple[int, ...], ...] | None = None
+ segment_centers: tuple[tuple[int, ...], ...] | None = None
+
+ if segment_config is not None and segmented_df is not None:
+ segment_assignments, segment_durations, segment_centers = (
+ cls._extract_segment_data(segmented_df, segment_center_indices)
+ )
+
+ # Extract representation from configs
+ representation = cluster_config.get_representation()
+ segment_representation = (
+ segment_config.representation if segment_config else None
+ )
+
+ # Extract extreme cluster indices
+ extreme_cluster_indices_tuple: tuple[int, ...] | None = None
+ if extreme_cluster_idx:
+ extreme_cluster_indices_tuple = tuple(int(x) for x in extreme_cluster_idx)
+
+ # Compute period_duration
+ effective_resolution = (
+ temporal_resolution
+ if temporal_resolution is not None
+ else _infer_resolution(original_data)
+ )
+ period_duration = n_timesteps_per_period * effective_resolution
+
+ return cls(
+ period_duration=period_duration,
+ cluster_assignments=tuple(int(x) for x in cluster_order),
+ cluster_centers=cluster_centers,
+ segment_assignments=segment_assignments,
+ segment_durations=segment_durations,
+ segment_centers=segment_centers,
+ preserve_column_means=rescale_cluster_periods,
+ rescale_exclude_columns=tuple(rescale_exclude_columns)
+ if rescale_exclude_columns
+ else None,
+ representation=representation,
+ segment_representation=segment_representation,
+ temporal_resolution=temporal_resolution,
+ n_timesteps_per_period=n_timesteps_per_period,
+ extreme_cluster_indices=extreme_cluster_indices_tuple,
+ weights=dict(cluster_config.weights) if cluster_config.weights else None,
+ cluster_config=cluster_config,
+ segment_config=segment_config,
+ extremes_config=extremes_config,
+ version=_get_version(),
+ )
+
+ @staticmethod
+ def _extract_segment_data(
+ segmented_df: pd.DataFrame,
+ segment_center_indices: list | None,
+ ) -> tuple[
+ tuple[tuple[int, ...], ...],
+ tuple[tuple[int, ...], ...],
+ tuple[tuple[int, ...], ...] | None,
+ ]:
+ """Extract segment assignments, durations, and centers from a segmented DataFrame."""
+ assignments_list = []
+ durations_list = []
+
+ for period_idx in segmented_df.index.get_level_values(0).unique():
+ period_data = segmented_df.loc[period_idx]
+ assignments = []
+ durations = []
+ for seg_step, seg_dur, _orig_start in period_data.index:
+ assignments.extend([int(seg_step)] * int(seg_dur))
+ durations.append(int(seg_dur))
+ assignments_list.append(tuple(assignments))
+ durations_list.append(tuple(durations))
+
+ centers: tuple[tuple[int, ...], ...] | None = None
+        if segment_center_indices is not None and all(
+            pc is not None for pc in segment_center_indices
+        ):
+            centers = tuple(
+                tuple(int(x) for x in period_centers)
+                for period_centers in segment_center_indices
+            )
+
+ return tuple(assignments_list), tuple(durations_list), centers
+
@property
def n_clusters(self) -> int:
"""Number of clusters (typical periods)."""
@@ -689,6 +905,7 @@ def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
# Transfer fields (always included)
result: dict[str, Any] = {
+ "version": self.version or _get_version(),
"period_duration": self.period_duration,
"cluster_assignments": list(self.cluster_assignments),
"n_timesteps_per_period": self.n_timesteps_per_period,
@@ -736,6 +953,7 @@ def from_dict(cls, data: dict) -> ClusteringResult:
"n_timesteps_per_period": data["n_timesteps_per_period"],
"preserve_column_means": data.get("preserve_column_means", True),
"representation": _representation_from_dict(rep_data),
+ "version": data.get("version"),
}
if "cluster_centers" in data:
kwargs["cluster_centers"] = tuple(data["cluster_centers"])
@@ -949,11 +1167,9 @@ def apply(
>>> clustering = ClusteringResult.from_json("clustering.json")
>>> result = clustering.apply(df)
"""
- # Import here to avoid circular imports
- from tsam.api import _build_old_params
- from tsam.exceptions import LegacyAPIWarning
- from tsam.result import AccuracyMetrics, AggregationResult
- from tsam.timeseriesaggregation import TimeSeriesAggregation
+ from tsam.api import _build_aggregation_result
+ from tsam.pipeline import run_pipeline
+ from tsam.pipeline.types import PipelineConfig, PredefParams
# Warn if using replace extreme method (transfer is not exact)
if (
@@ -979,12 +1195,8 @@ def apply(
)
# Validate n_timesteps_per_period matches data
- # Infer timestep duration from data if not provided
if effective_temporal_resolution is None:
- if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
- inferred = (data.index[1] - data.index[0]).total_seconds() / 3600
- else:
- inferred = 1.0 # Default to hourly
+ inferred = _infer_resolution(data)
else:
inferred = effective_temporal_resolution
@@ -1015,111 +1227,51 @@ def apply(
# Use stored segment config if available, otherwise build from transfer fields
segments: SegmentConfig | None = None
- n_segments: int | None = None
if self.segment_assignments is not None and self.segment_durations is not None:
- n_segments = len(self.segment_durations[0])
+ n_segments_val = len(self.segment_durations[0])
segments = self.segment_config or SegmentConfig(
- n_segments=n_segments,
+ n_segments=n_segments_val,
representation=self.segment_representation or "mean",
)
- # Build old API parameters, passing predefined values directly
- # Note: Don't pass extremes config - extreme clusters are handled via
- # extreme_cluster_indices and representations are computed from
- # the periods assigned to those clusters in cluster_assignments
- old_params = _build_old_params(
- data=data,
+ # Run pipeline with predefined parameters
+ predef = PredefParams(
+ cluster_order=list(self.cluster_assignments),
+ cluster_center_indices=list(self.cluster_centers)
+ if self.cluster_centers
+ else None,
+ extreme_cluster_idx=list(self.extreme_cluster_indices)
+ if self.extreme_cluster_indices
+ else None,
+ segment_order=[list(s) for s in self.segment_assignments]
+ if self.segment_assignments
+ else None,
+ segment_durations=[list(s) for s in self.segment_durations]
+ if self.segment_durations
+ else None,
+ segment_centers=[list(s) for s in self.segment_centers]
+ if self.segment_centers
+ else None,
+ )
+
+ cfg = PipelineConfig(
n_clusters=self.n_clusters,
- period_duration=self.period_duration,
- temporal_resolution=effective_temporal_resolution,
+ n_timesteps_per_period=self.n_timesteps_per_period,
cluster=cluster,
segments=segments,
- extremes=None,
- preserve_column_means=self.preserve_column_means,
+ rescale_cluster_periods=self.preserve_column_means,
rescale_exclude_columns=list(self.rescale_exclude_columns)
if self.rescale_exclude_columns
else None,
round_decimals=round_decimals,
numerical_tolerance=numerical_tolerance,
- weights=self.weights,
- # Predefined values from this ClusteringResult
- predef_cluster_assignments=self.cluster_assignments,
- predef_cluster_centers=self.cluster_centers,
- predef_extreme_cluster_indices=self.extreme_cluster_indices,
- predef_segment_assignments=self.segment_assignments,
- predef_segment_durations=self.segment_durations,
- predef_segment_centers=self.segment_centers,
- )
-
- # Run aggregation using old implementation (suppress deprecation warning)
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", LegacyAPIWarning)
- agg = TimeSeriesAggregation(**old_params)
- cluster_representatives = agg.createTypicalPeriods()
-
- # Rename index levels for consistency with new API terminology
- cluster_representatives = cluster_representatives.rename_axis(
- index={"PeriodNum": "cluster", "TimeStep": "timestep"}
- )
-
- # Build accuracy metrics
- accuracy_df = agg.accuracyIndicators()
-
- # Build rescale deviations DataFrame
- rescale_deviations_dict = getattr(agg, "_rescaleDeviations", {})
- if rescale_deviations_dict:
- rescale_deviations = pd.DataFrame.from_dict(
- rescale_deviations_dict, orient="index"
- )
- rescale_deviations.index.name = "column"
- else:
- rescale_deviations = pd.DataFrame(
- columns=["deviation_pct", "converged", "iterations"]
- )
-
- from tsam.api import _weighted_mean, _weighted_rms
-
- accuracy = AccuracyMetrics(
- rmse=accuracy_df["RMSE"],
- mae=accuracy_df["MAE"],
- rmse_duration=accuracy_df["RMSE_duration"],
- rescale_deviations=rescale_deviations,
- weighted_rmse=_weighted_rms(accuracy_df["RMSE"], self.weights),
- weighted_mae=_weighted_mean(accuracy_df["MAE"], self.weights),
- weighted_rmse_duration=_weighted_rms(
- accuracy_df["RMSE_duration"], self.weights
- ),
- )
-
- # Build ClusteringResult - preserve stored values
- from tsam.api import _build_clustering_result
-
- clustering_result = _build_clustering_result(
- agg=agg,
- n_segments=n_segments,
- cluster_config=cluster,
- segment_config=segments,
- extremes_config=self.extremes_config,
- weights=self.weights,
- preserve_column_means=self.preserve_column_means,
- rescale_exclude_columns=list(self.rescale_exclude_columns)
- if self.rescale_exclude_columns
- else None,
temporal_resolution=effective_temporal_resolution,
+ predef=predef,
)
- # Build result object
- return AggregationResult(
- cluster_representatives=cluster_representatives,
- cluster_weights=dict(agg.clusterPeriodNoOccur),
- n_timesteps_per_period=agg.timeStepsPerPeriod,
- segment_durations=self.segment_durations,
- accuracy=accuracy,
- clustering_duration=getattr(agg, "clusteringDuration", 0.0),
- clustering=clustering_result,
- is_transferred=True,
- _aggregation=agg,
- )
+ result = run_pipeline(data=data, cfg=cfg)
+
+ return _build_aggregation_result(result, is_transferred=True)
@dataclass(frozen=True)
@@ -1153,7 +1305,6 @@ class ExtremeConfig:
min_period : list[str], optional
Column names where the period with minimum total should be preserved.
Example: ["wind_generation"] to preserve lowest wind day.
-
"""
method: ExtremeMethod = "append"
@@ -1193,29 +1344,3 @@ def from_dict(cls, data: dict) -> ExtremeConfig:
max_period=data.get("max_period", []),
min_period=data.get("min_period", []),
)
-
-
-# Mapping from new API names to old API names
-METHOD_MAPPING: dict[ClusterMethod, str] = {
- "averaging": "averaging",
- "kmeans": "k_means",
- "kmedoids": "k_medoids",
- "kmaxoids": "k_maxoids",
- "hierarchical": "hierarchical",
- "contiguous": "adjacent_periods",
-}
-
-REPRESENTATION_MAPPING: dict[RepresentationMethod, str] = {
- "mean": "meanRepresentation",
- "medoid": "medoidRepresentation",
- "maxoid": "maxoidRepresentation",
- "distribution": "distributionRepresentation",
- "distribution_minmax": "distributionAndMinMaxRepresentation",
- "minmax_mean": "minmaxmeanRepresentation",
-}
-
-EXTREME_METHOD_MAPPING: dict[ExtremeMethod, str] = {
- "append": "append",
- "replace": "replace_cluster_center",
- "new_cluster": "new_cluster_center",
-}
diff --git a/src/tsam/hyperparametertuning.py b/src/tsam/hyperparametertuning.py
index b06f64c9..2687c7da 100644
--- a/src/tsam/hyperparametertuning.py
+++ b/src/tsam/hyperparametertuning.py
@@ -8,20 +8,22 @@
from tsam.timeseriesaggregation import TimeSeriesAggregation
-def getNoPeriodsForDataReduction(noRawTimeSteps, segmentsPerPeriod, dataReduction):
+def get_no_periods_for_data_reduction(
+ n_raw_timesteps, segments_per_period, data_reduction
+):
"""
Identifies the maximum number of periods which can be set to achieve the required data reduction.
- :param noRawTimeSteps: Number of original time steps. required
- :type noRawTimeSteps: int
+ :param n_raw_timesteps: Number of original time steps. required
+ :type n_raw_timesteps: int
- :param segmentsPerPeriod: Segments per period. required
- :type segmentsPerPeriod: int
+ :param segments_per_period: Segments per period. required
+ :type segments_per_period: int
- :param dataReduction: Factor by which the resulting dataset should be reduced. required
- :type dataReduction: float
+ :param data_reduction: Factor by which the resulting dataset should be reduced. required
+ :type data_reduction: float
- :returns: **noTypicalPeriods** -- Number of typical periods that can be set.
+ :returns: **no_typical_periods** -- Number of typical periods that can be set.
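+
+    Example (worked numbers, not from the source): with 8760 hourly steps,
+    24 segments per period, and a data-reduction factor of 0.1, at most
+    ``floor(0.1 * 8760 / 24) = 36`` typical periods can be chosen.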
.. deprecated::
This function is deprecated along with the HyperTunedAggregations class.
@@ -32,23 +34,25 @@ def getNoPeriodsForDataReduction(noRawTimeSteps, segmentsPerPeriod, dataReductio
LegacyAPIWarning,
stacklevel=2,
)
- return int(np.floor(dataReduction * float(noRawTimeSteps) / segmentsPerPeriod))
+ return int(np.floor(data_reduction * float(n_raw_timesteps) / segments_per_period))
-def getNoSegmentsForDataReduction(noRawTimeSteps, typicalPeriods, dataReduction):
+def get_no_segments_for_data_reduction(
+ n_raw_timesteps, typical_periods, data_reduction
+):
"""
Identifies the maximum number of segments which can be set to achieve the required data reduction.
- :param noRawTimeSteps: Number of original time steps. required
- :type noRawTimeSteps: int
+ :param n_raw_timesteps: Number of original time steps. required
+ :type n_raw_timesteps: int
- :param typicalPeriods: Number of typical periods. required
- :type typicalPeriods: int
+ :param typical_periods: Number of typical periods. required
+ :type typical_periods: int
- :param dataReduction: Factor by which the resulting dataset should be reduced. required
- :type dataReduction: float
+ :param data_reduction: Factor by which the resulting dataset should be reduced. required
+ :type data_reduction: float
- :returns: **segmentsPerPeriod** -- Number of segments per period that can be set.
+ :returns: **segments_per_period** -- Number of segments per period that can be set.
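+
+    Example (worked numbers, not from the source): with 8760 hourly steps,
+    8 typical periods, and a data-reduction factor of 0.1, at most
+    ``floor(0.1 * 8760 / 8) = 109`` segments per period can be chosen.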
.. deprecated::
This function is deprecated along with the HyperTunedAggregations class.
@@ -59,24 +63,44 @@ def getNoSegmentsForDataReduction(noRawTimeSteps, typicalPeriods, dataReduction)
LegacyAPIWarning,
stacklevel=2,
)
- return int(np.floor(dataReduction * float(noRawTimeSteps) / typicalPeriods))
+ return int(np.floor(data_reduction * float(n_raw_timesteps) / typical_periods))
+
+
+# Backward-compatible function aliases (deprecated)
+getNoPeriodsForDataReduction = get_no_periods_for_data_reduction
+getNoSegmentsForDataReduction = get_no_segments_for_data_reduction
class HyperTunedAggregations:
- def __init__(self, base_aggregation, saveAggregationHistory=True):
+    def __init__(self, base_aggregation, save_aggregation_history=None, **kwargs):
"""
A class that does a parameter variation and tuning of the aggregation itself.
:param base_aggregation: TimeSeriesAggregation object which is used as basis for tuning the hyper parameters. required
:type base_aggregation: TimeSeriesAggregation
- :param saveAggregationHistory: Defines if all aggregations that are created during the tuning and iterations shall be saved under self.aggregationHistory.
- :type saveAggregationHistory: boolean
+        :param save_aggregation_history: Whether to keep every aggregation created during tuning and iteration under ``self.aggregation_history``. Defaults to True.
+        :type save_aggregation_history: boolean
.. deprecated::
Use :func:`tsam.tuning.find_optimal_combination` or
:func:`tsam.tuning.find_pareto_front` instead.
"""
+        # Translate the deprecated camelCase kwarg. ``None`` marks
+        # "not passed", so supplying both spellings is detected reliably.
+        if "saveAggregationHistory" in kwargs:
+            warnings.warn(
+                "'saveAggregationHistory' is deprecated, use 'save_aggregation_history'.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if save_aggregation_history is not None:
+                raise TypeError(
+                    "Cannot specify both 'saveAggregationHistory' and 'save_aggregation_history'"
+                )
+            save_aggregation_history = kwargs.pop("saveAggregationHistory")
+        if kwargs:
+            raise TypeError(f"Unexpected keyword arguments: {set(kwargs)}")
+        if save_aggregation_history is None:
+            save_aggregation_history = True
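+        # e.g. HyperTunedAggregations(agg, saveAggregationHistory=False) warns
+        # and is treated as save_aggregation_history=False.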
+
warnings.warn(
"HyperTunedAggregations will be removed in tsam v4.0. "
"Use tsam.tuning.find_optimal_combination() or tsam.tuning.find_pareto_front() instead.",
@@ -90,131 +114,133 @@ def __init__(self, base_aggregation, saveAggregationHistory=True):
"base_aggregation has to be an TimeSeriesAggregation object"
)
- self._alterableAggregation = copy.deepcopy(self.base_aggregation)
+ self._alterable_aggregation = copy.deepcopy(self.base_aggregation)
- self.saveAggregationHistory = saveAggregationHistory
+ self.save_aggregation_history = save_aggregation_history
- self._segmentHistory = []
+ self._segment_history = []
- self._periodHistory = []
+ self._period_history = []
- self._RMSEHistory = []
+ self._rmse_history = []
- if self.saveAggregationHistory:
- self.aggregationHistory = []
+ if self.save_aggregation_history:
+ self.aggregation_history = []
- def _testAggregation(self, noTypicalPeriods, noSegments):
+ def _test_aggregation(self, no_typical_periods, no_segments):
"""
Tests the aggregation for a set of typical periods and segments and returns the RMSE
"""
- self._segmentHistory.append(noSegments)
+ self._segment_history.append(no_segments)
- self._periodHistory.append(noTypicalPeriods)
+ self._period_history.append(no_typical_periods)
- self._alterableAggregation.noTypicalPeriods = noTypicalPeriods
+ self._alterable_aggregation.no_typical_periods = no_typical_periods
- self._alterableAggregation.noSegments = noSegments
+ self._alterable_aggregation.no_segments = no_segments
- self._alterableAggregation.createTypicalPeriods()
+ self._alterable_aggregation.create_typical_periods()
- self._alterableAggregation.predictOriginalData()
+ self._alterable_aggregation.predict_original_data()
- RMSE = self._alterableAggregation.totalAccuracyIndicators()["RMSE"]
+ rmse = self._alterable_aggregation.total_accuracy_indicators()["RMSE"]
- self._RMSEHistory.append(RMSE)
+ self._rmse_history.append(rmse)
- if self.saveAggregationHistory:
- self.aggregationHistory.append(copy.copy(self._alterableAggregation))
+ if self.save_aggregation_history:
+ self.aggregation_history.append(copy.copy(self._alterable_aggregation))
- return RMSE
+ return rmse
- def _deleteTestHistory(self, index):
+ def _delete_test_history(self, index):
"""
- Delelets the defined index from the test history
+ Deletes the defined index from the test history
"""
- del self._segmentHistory[index]
- del self._periodHistory[index]
- del self._RMSEHistory[index]
+ del self._segment_history[index]
+ del self._period_history[index]
+ del self._rmse_history[index]
- if self.saveAggregationHistory:
- del self.aggregationHistory[index]
+ if self.save_aggregation_history:
+ del self.aggregation_history[index]
- def identifyOptimalSegmentPeriodCombination(self, dataReduction):
+ def identify_optimal_segment_period_combination(self, data_reduction):
"""
Identifies the optimal combination of number of typical periods and number of segments for a given data reduction set.
- :param dataReduction: Factor by which the resulting dataset should be reduced. required
- :type dataReduction: float
+ :param data_reduction: Factor by which the resulting dataset should be reduced. required
+ :type data_reduction: float
- :returns: **noSegments, noTypicalperiods** -- The optimal combination of segments and typical periods for the given optimization set.
+        :returns: **no_segments, no_typical_periods, rmse_min** -- The optimal combination of segments and typical periods for the given data-reduction target, together with its RMSE.
"""
if not self.base_aggregation.segmentation:
raise ValueError(
"This function does only make sense in combination with 'segmentation' activated."
)
- noRawTimeSteps = len(self.base_aggregation.timeSeries.index)
+ n_raw_timesteps = len(self.base_aggregation.time_series.index)
- _maxPeriods = int(
- float(noRawTimeSteps) / self.base_aggregation.timeStepsPerPeriod
+ _max_periods = int(
+ float(n_raw_timesteps) / self.base_aggregation.time_steps_per_period
)
- _maxSegments = self.base_aggregation.timeStepsPerPeriod
+ _max_segments = self.base_aggregation.time_steps_per_period
# save RMSE
- RMSE_history = []
+ rmse_history = []
# correct 0 index of python
- possibleSegments = np.arange(_maxSegments) + 1
- possiblePeriods = np.arange(_maxPeriods) + 1
+ possible_segments = np.arange(_max_segments) + 1
+ possible_periods = np.arange(_max_periods) + 1
# number of time steps of all combinations of segments and periods
- combinedTimeSteps = np.outer(possibleSegments, possiblePeriods)
+ combined_timesteps = np.outer(possible_segments, possible_periods)
# reduce to valid combinations for targeted data reduction
- reductionValidCombinations = combinedTimeSteps <= noRawTimeSteps * dataReduction
+ reduction_valid_combinations = (
+ combined_timesteps <= n_raw_timesteps * data_reduction
+ )
# number of time steps for all feasible combinations
- reductionValidTimsteps = combinedTimeSteps * reductionValidCombinations
+ reduction_valid_timesteps = combined_timesteps * reduction_valid_combinations
# identify max segments and max period combination
- optimalPeriods = np.zeros_like(reductionValidTimsteps)
- optimalPeriods[
- np.arange(reductionValidTimsteps.shape[0]),
- reductionValidTimsteps.argmax(axis=1),
+ optimal_periods = np.zeros_like(reduction_valid_timesteps)
+ optimal_periods[
+ np.arange(reduction_valid_timesteps.shape[0]),
+ reduction_valid_timesteps.argmax(axis=1),
] = 1
- optimalSegments = np.zeros_like(reductionValidTimsteps)
- optimalSegments[
- reductionValidTimsteps.argmax(axis=0),
- np.arange(reductionValidTimsteps.shape[1]),
+ optimal_segments = np.zeros_like(reduction_valid_timesteps)
+ optimal_segments[
+ reduction_valid_timesteps.argmax(axis=0),
+ np.arange(reduction_valid_timesteps.shape[1]),
] = 1
- optimalIndexCombo = np.nonzero(optimalPeriods * optimalSegments)
+ optimal_index_combo = np.nonzero(optimal_periods * optimal_segments)
- for segmentIx, periodIx in tqdm.tqdm(
- zip(optimalIndexCombo[0], optimalIndexCombo[1])
+ for segment_ix, period_ix in tqdm.tqdm(
+ zip(optimal_index_combo[0], optimal_index_combo[1])
):
# derive new typical periods and derive rmse
- RMSE_history.append(
- self._testAggregation(
- possiblePeriods[periodIx], possibleSegments[segmentIx]
+ rmse_history.append(
+ self._test_aggregation(
+ possible_periods[period_ix], possible_segments[segment_ix]
)
)
# take the negative backwards index with the minimal RMSE
- min_index = -list(reversed(RMSE_history)).index(min(RMSE_history)) - 1
- RMSE_min = RMSE_history[min_index]
+ min_index = -list(reversed(rmse_history)).index(min(rmse_history)) - 1
+ rmse_min = rmse_history[min_index]
- noTypicalPeriods = self._periodHistory[min_index]
- noSegments = self._segmentHistory[min_index]
+ no_typical_periods = self._period_history[min_index]
+ no_segments = self._segment_history[min_index]
# and return the segment and typical period pair
- return noSegments, noTypicalPeriods, RMSE_min
+ return no_segments, no_typical_periods, rmse_min
- def identifyParetoOptimalAggregation(self, untilTotalTimeSteps=None):
+ def identify_pareto_optimal_aggregation(self, until_total_timesteps=None):
"""
Identifies the Pareto-optimal combinations of the number of typical periods and the number of segments via a steepest-descent approach, starting from an aggregation to a single period with a single segment and continuing up to the representation of the full time series.
- :param untilTotalTimeSteps: Number of timesteps until which the pareto-front should be determined. If None, the maximum number of timesteps is chosen.
- :type untilTotalTimeSteps: int
+ :param until_total_timesteps: Number of timesteps until which the pareto-front should be determined. If None, the maximum number of timesteps is chosen.
+ :type until_total_timesteps: int
:returns: None. Check aggregation history for results. All typical Periods in scaled form.
@@ -224,68 +250,91 @@ def identifyParetoOptimalAggregation(self, untilTotalTimeSteps=None):
"This function does only make sense in combination with 'segmentation' activated."
)
- noRawTimeSteps = len(self.base_aggregation.timeSeries.index)
+ n_raw_timesteps = len(self.base_aggregation.time_series.index)
- _maxPeriods = int(
- float(noRawTimeSteps) / self.base_aggregation.timeStepsPerPeriod
+ _max_periods = int(
+ float(n_raw_timesteps) / self.base_aggregation.time_steps_per_period
)
- _maxSegments = self.base_aggregation.timeStepsPerPeriod
+ _max_segments = self.base_aggregation.time_steps_per_period
- if untilTotalTimeSteps is None:
- untilTotalTimeSteps = noRawTimeSteps
+ if until_total_timesteps is None:
+ until_total_timesteps = n_raw_timesteps
- progressBar = tqdm.tqdm(total=untilTotalTimeSteps)
+ progress_bar = tqdm.tqdm(total=until_total_timesteps)
# starting point
- noTypicalPeriods = 1
- noSegments = 1
- _RMSE_0 = self._testAggregation(noTypicalPeriods, noSegments)
+ no_typical_periods = 1
+ no_segments = 1
+ _rmse_0 = self._test_aggregation(no_typical_periods, no_segments)
# loop until either segments or periods have reached their maximum
while (
- noTypicalPeriods < _maxPeriods
- and noSegments < _maxSegments
- and (noSegments + 1) * noTypicalPeriods <= untilTotalTimeSteps
- and noSegments * (noTypicalPeriods + 1) <= untilTotalTimeSteps
+ no_typical_periods < _max_periods
+ and no_segments < _max_segments
+ and (no_segments + 1) * no_typical_periods <= until_total_timesteps
+ and no_segments * (no_typical_periods + 1) <= until_total_timesteps
):
# test for more segments
- RMSE_segments = self._testAggregation(noTypicalPeriods, noSegments + 1)
+ rmse_segments = self._test_aggregation(no_typical_periods, no_segments + 1)
# test for more periods
- RMSE_periods = self._testAggregation(noTypicalPeriods + 1, noSegments)
+ rmse_periods = self._test_aggregation(no_typical_periods + 1, no_segments)
# RMSE old
- RMSE_old = self._RMSEHistory[-3]
+ rmse_old = self._rmse_history[-3]
# segment gradient (RMSE improvement per increased time step number)
# for segments: for each period on segment added
- RMSE_segment_gradient = (RMSE_old - RMSE_segments) / noTypicalPeriods
+ rmse_segment_gradient = (rmse_old - rmse_segments) / no_typical_periods
# for periods: one period with no of segments
- RMSE_periods_gradient = (RMSE_old - RMSE_periods) / noSegments
+ rmse_periods_gradient = (rmse_old - rmse_periods) / no_segments
# go along the steeper gradient
- if RMSE_periods_gradient > RMSE_segment_gradient:
- noTypicalPeriods += 1
- # and delete the search direction which was not persued
- self._deleteTestHistory(-2)
+ if rmse_periods_gradient > rmse_segment_gradient:
+ no_typical_periods += 1
+ # and delete the search direction which was not pursued
+ self._delete_test_history(-2)
else:
- noSegments += 1
- self._deleteTestHistory(-1)
- progressBar.update(noSegments * noTypicalPeriods - progressBar.n)
+ no_segments += 1
+ self._delete_test_history(-1)
+ progress_bar.update(no_segments * no_typical_periods - progress_bar.n)
# afterwards loop over periods and segments exclusively until maximum is reached
while (
- noTypicalPeriods < _maxPeriods
- and noSegments * (noTypicalPeriods + 1) <= untilTotalTimeSteps
+ no_typical_periods < _max_periods
+ and no_segments * (no_typical_periods + 1) <= until_total_timesteps
):
- noTypicalPeriods += 1
- self._testAggregation(noTypicalPeriods, noSegments)
- progressBar.update(noSegments * noTypicalPeriods - progressBar.n)
+ no_typical_periods += 1
+ self._test_aggregation(no_typical_periods, no_segments)
+ progress_bar.update(no_segments * no_typical_periods - progress_bar.n)
while (
- noSegments < _maxSegments
- and (noSegments + 1) * noTypicalPeriods <= untilTotalTimeSteps
+ no_segments < _max_segments
+ and (no_segments + 1) * no_typical_periods <= until_total_timesteps
):
- noSegments += 1
- self._testAggregation(noTypicalPeriods, noSegments)
- progressBar.update(noSegments * noTypicalPeriods - progressBar.n)
+ no_segments += 1
+ self._test_aggregation(no_typical_periods, no_segments)
+ progress_bar.update(no_segments * no_typical_periods - progress_bar.n)
return
+
+ # Backward-compatible method aliases (deprecated)
+ identifyOptimalSegmentPeriodCombination = (
+ identify_optimal_segment_period_combination
+ )
+ identifyParetoOptimalAggregation = identify_pareto_optimal_aggregation
+
+ # Backward-compatible property aliases (deprecated)
+ @property
+ def aggregationHistory(self):
+ return self.aggregation_history
+
+ @property
+ def _RMSEHistory(self):
+ return self._rmse_history
+
+ @property
+ def _segmentHistory(self):
+ return self._segment_history
+
+ @property
+ def _periodHistory(self):
+ return self._period_history
diff --git a/src/tsam/options.py b/src/tsam/options.py
new file mode 100644
index 00000000..d012d2a3
--- /dev/null
+++ b/src/tsam/options.py
@@ -0,0 +1,78 @@
+"""Global runtime-tunable options for tsam."""
+
+from __future__ import annotations
+
+
+class Options:
+ """Runtime-tunable options for tsam.
+
+ Access and modify via the module-level ``tsam.options`` instance.
+
+ Examples
+ --------
+ >>> import tsam
+ >>> tsam.options.rescale_max_iterations = 50
+ >>> tsam.options.rescale_tolerance = 1e-8
+ >>> tsam.options.min_weight = 1e-4
+ >>> tsam.options.reset() # restore defaults
+ """
+
+ def __init__(self) -> None:
+ self._rescale_max_iterations: int = 20
+ self._rescale_tolerance: float = 1e-6
+ self._min_weight: float = 1e-6
+
+ @property
+ def rescale_max_iterations(self) -> int:
+ """Maximum iterations for rescaling convergence (default: 20)."""
+ return self._rescale_max_iterations
+
+ @rescale_max_iterations.setter
+ def rescale_max_iterations(self, value: int) -> None:
+ if not isinstance(value, int) or value < 1:
+ raise ValueError(
+ f"rescale_max_iterations must be a positive integer, got {value}"
+ )
+ self._rescale_max_iterations = value
+
+ @property
+ def rescale_tolerance(self) -> float:
+ """Convergence tolerance for rescaling (default: 1e-6)."""
+ return self._rescale_tolerance
+
+ @rescale_tolerance.setter
+ def rescale_tolerance(self, value: float) -> None:
+ if not isinstance(value, (int, float)) or value <= 0:
+ raise ValueError(
+ f"rescale_tolerance must be a positive number, got {value}"
+ )
+ self._rescale_tolerance = float(value)
+
+ @property
+ def min_weight(self) -> float:
+ """Minimum allowed column weight (default: 1e-6)."""
+ return self._min_weight
+
+ @min_weight.setter
+ def min_weight(self, value: float) -> None:
+ if not isinstance(value, (int, float)) or value <= 0:
+ raise ValueError(f"min_weight must be a positive number, got {value}")
+ self._min_weight = float(value)
+
+ def reset(self) -> None:
+ """Reset all options to defaults."""
+ self._rescale_max_iterations = 20
+ self._rescale_tolerance = 1e-6
+ self._min_weight = 1e-6
+
+ def __repr__(self) -> str:
+ return (
+ f"Options(\n"
+ f" rescale_max_iterations={self.rescale_max_iterations},\n"
+ f" rescale_tolerance={self.rescale_tolerance},\n"
+ f" min_weight={self.min_weight},\n"
+ f")"
+ )
+
+
+options = Options()
diff --git a/src/tsam/periodAggregation.py b/src/tsam/periodAggregation.py
deleted file mode 100644
index 96829715..00000000
--- a/src/tsam/periodAggregation.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import numpy as np
-
-from tsam.representations import representations
-
-
-def aggregatePeriods(
- candidates,
- n_clusters=8,
- n_iter=100,
- clusterMethod="k_means",
- solver="highs",
- representationMethod=None,
- representationDict=None,
- distributionPeriodWise=True,
- timeStepsPerPeriod=None,
- n_extra_columns=0,
-):
- """
- Clusters the data based on one of the cluster methods:
- 'averaging', 'k_means', 'exact k_medoid' or 'hierarchical'
-
- :param candidates: Dissimilarity matrix where each row represents a candidate. required
- :type candidates: np.ndarray
-
- :param n_clusters: Number of aggregated cluster. optional (default: 8)
- :type n_clusters: integer
-
- :param n_iter: Only required for the number of starts of the k-mean algorithm. optional (default: 10)
- :type n_iter: integer
-
- :param clusterMethod: Chosen clustering algorithm. Possible values are
- 'averaging','k_means','exact k_medoid' or 'hierarchical'. optional (default: 'k_means')
- :type clusterMethod: string
-
- :param n_extra_columns: Number of extra columns appended to candidates for
- clustering (e.g. period sums) that should be excluded from the
- representation step. optional (default: 0)
- :type n_extra_columns: integer
- """
- # Candidates used for representation exclude extra evaluation columns
- repr_candidates = (
- candidates[:, :-n_extra_columns] if n_extra_columns else candidates
- )
-
- # cluster the data
- if clusterMethod == "averaging":
- n_sets = len(candidates)
- if n_sets % n_clusters == 0:
- cluster_size = int(n_sets / n_clusters)
- clusterOrder = [
- [n_cluster] * cluster_size for n_cluster in range(n_clusters)
- ]
- else:
- cluster_size = int(n_sets / n_clusters)
- clusterOrder = [
- [n_cluster] * cluster_size for n_cluster in range(n_clusters)
- ]
- clusterOrder.append(
- [n_clusters - 1] * int(n_sets - cluster_size * n_clusters)
- )
- clusterOrder = np.hstack(np.array(clusterOrder, dtype=object))
- clusterCenters, clusterCenterIndices = representations(
- repr_candidates,
- clusterOrder,
- default="meanRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=timeStepsPerPeriod,
- )
-
- if clusterMethod == "k_means":
- from sklearn.cluster import KMeans
-
- k_means = KMeans(n_clusters=n_clusters, max_iter=1000, n_init=n_iter, tol=1e-4)
-
- clusterOrder = k_means.fit_predict(candidates)
- # get with own mean representation to avoid numerical trouble caused by sklearn
- clusterCenters, clusterCenterIndices = representations(
- repr_candidates,
- clusterOrder,
- default="meanRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=timeStepsPerPeriod,
- )
-
- if clusterMethod == "k_medoids":
- from tsam.utils.k_medoids_exact import KMedoids
-
- k_medoid = KMedoids(n_clusters=n_clusters, solver=solver)
-
- clusterOrder = k_medoid.fit_predict(candidates)
- clusterCenters, clusterCenterIndices = representations(
- repr_candidates,
- clusterOrder,
- default="medoidRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=timeStepsPerPeriod,
- )
-
- if clusterMethod == "k_maxoids":
- from tsam.utils.k_maxoids import KMaxoids
-
- k_maxoid = KMaxoids(n_clusters=n_clusters)
-
- clusterOrder = k_maxoid.fit_predict(candidates)
- clusterCenters, clusterCenterIndices = representations(
- repr_candidates,
- clusterOrder,
- default="maxoidRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=timeStepsPerPeriod,
- )
-
- if clusterMethod == "hierarchical" or clusterMethod == "adjacent_periods":
- if n_clusters == 1:
- clusterOrder = np.asarray([0] * len(candidates))
- else:
- from sklearn.cluster import AgglomerativeClustering
-
- if clusterMethod == "hierarchical":
- clustering = AgglomerativeClustering(
- n_clusters=n_clusters, linkage="ward"
- )
- elif clusterMethod == "adjacent_periods":
- adjacencyMatrix = np.eye(len(candidates), k=1) + np.eye(
- len(candidates), k=-1
- )
- clustering = AgglomerativeClustering(
- n_clusters=n_clusters, linkage="ward", connectivity=adjacencyMatrix
- )
- clusterOrder = clustering.fit_predict(candidates)
- # represent hierarchical aggregation with medoid
- clusterCenters, clusterCenterIndices = representations(
- repr_candidates,
- clusterOrder,
- default="medoidRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=timeStepsPerPeriod,
- )
-
- return clusterCenters, clusterCenterIndices, clusterOrder
diff --git a/src/tsam/period_aggregation.py b/src/tsam/period_aggregation.py
new file mode 100644
index 00000000..b33ce66d
--- /dev/null
+++ b/src/tsam/period_aggregation.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from tsam.representations import representations
+
+if TYPE_CHECKING:
+ from tsam.config import Distribution, MinMaxMean
+
+# Aliases: old verbose names → new short names.
+# The legacy wrapper sends old names; the pipeline sends new names.
+_METHOD_ALIASES = {
+ "k_means": "kmeans",
+ "k_medoids": "kmedoids",
+ "k_maxoids": "kmaxoids",
+ "adjacent_periods": "contiguous",
+}
+
+
+def aggregate_periods(
+ candidates: np.ndarray,
+ n_clusters: int = 8,
+ n_iter: int = 100,
+ cluster_method: str = "kmeans",
+ solver: str = "highs",
+ representation_method: str | Distribution | MinMaxMean | None = None,
+ representation_dict: dict[str, str] | None = None,
+ distribution_period_wise: bool = True,
+ n_timesteps_per_period: int | None = None,
+ representation_candidates: np.ndarray | None = None,
+) -> tuple[list[np.ndarray], list[int] | None, np.ndarray]:
+ """
+    Clusters the data based on one of the cluster methods: 'averaging',
+    'kmeans', 'kmedoids', 'kmaxoids', 'hierarchical' or 'contiguous'.
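+
+    A minimal call sketch (hypothetical data; names as defined in this module):
+
+    >>> candidates = np.random.rand(365, 24)  # 365 daily profiles, 24 steps each
+    >>> centers, center_idx, order = aggregate_periods(
+    ...     candidates, n_clusters=8, cluster_method="hierarchical"
+    ... )  # doctest: +SKIP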
+ """
+ # Normalize old names to new names
+ cluster_method = _METHOD_ALIASES.get(cluster_method, cluster_method)
+
+ # Use separate candidates for representation if provided
+ _rep_candidates = (
+ representation_candidates
+ if representation_candidates is not None
+ else candidates
+ )
+
+ # cluster the data
+ if cluster_method == "averaging":
+ n_sets = len(candidates)
+ cluster_size = n_sets // n_clusters
+ order_lists = [[n_cluster] * cluster_size for n_cluster in range(n_clusters)]
+ remainder = n_sets - cluster_size * n_clusters
+ if remainder > 0:
+ order_lists.append([n_clusters - 1] * remainder)
+ cluster_order = np.hstack(np.array(order_lists, dtype=object)) # type: ignore[call-overload]
+ cluster_centers, cluster_center_indices = representations(
+ _rep_candidates,
+ cluster_order,
+ default="mean",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ elif cluster_method == "kmeans":
+ from sklearn.cluster import KMeans
+
+ k_means = KMeans(n_clusters=n_clusters, max_iter=1000, n_init=n_iter, tol=1e-4)
+
+ cluster_order = k_means.fit_predict(candidates)
+ # get with own mean representation to avoid numerical trouble caused by sklearn
+ cluster_centers, cluster_center_indices = representations(
+ _rep_candidates,
+ cluster_order,
+ default="mean",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ elif cluster_method == "kmedoids":
+ from tsam.utils.k_medoids_exact import KMedoids
+
+ k_medoid = KMedoids(n_clusters=n_clusters, solver=solver)
+
+ cluster_order = k_medoid.fit_predict(candidates)
+ cluster_centers, cluster_center_indices = representations(
+ _rep_candidates,
+ cluster_order,
+ default="medoid",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ elif cluster_method == "kmaxoids":
+ from tsam.utils.k_maxoids import KMaxoids
+
+ k_maxoid = KMaxoids(n_clusters=n_clusters)
+
+ cluster_order = k_maxoid.fit_predict(candidates)
+ cluster_centers, cluster_center_indices = representations(
+ _rep_candidates,
+ cluster_order,
+ default="maxoid",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ elif cluster_method == "hierarchical" or cluster_method == "contiguous":
+ if n_clusters == 1:
+ cluster_order = np.asarray([0] * len(candidates))
+ else:
+ from sklearn.cluster import AgglomerativeClustering
+
+ if cluster_method == "hierarchical":
+ clustering = AgglomerativeClustering(
+ n_clusters=n_clusters, linkage="ward"
+ )
+ elif cluster_method == "contiguous":
+ adjacency_matrix = np.eye(len(candidates), k=1) + np.eye(
+ len(candidates), k=-1
+ )
+ clustering = AgglomerativeClustering(
+ n_clusters=n_clusters, linkage="ward", connectivity=adjacency_matrix
+ )
+ cluster_order = clustering.fit_predict(candidates)
+ # represent hierarchical aggregation with medoid
+ cluster_centers, cluster_center_indices = representations(
+ _rep_candidates,
+ cluster_order,
+ default="medoid",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ else:
+ raise ValueError(
+ f"Unknown cluster_method '{cluster_method}'. "
+ f"Valid options: 'averaging', 'kmeans', 'kmedoids', 'kmaxoids', 'hierarchical', 'contiguous'."
+ )
+
+ return cluster_centers, cluster_center_indices, cluster_order
diff --git a/src/tsam/pipeline/__init__.py b/src/tsam/pipeline/__init__.py
new file mode 100644
index 00000000..b01f5ebf
--- /dev/null
+++ b/src/tsam/pipeline/__init__.py
@@ -0,0 +1,520 @@
+"""Pipeline package — pure-function rewrite of create_typical_periods."""
+
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from tsam.options import options
+from tsam.pipeline.accuracy import reconstruct
+from tsam.pipeline.clustering import (
+ cluster_periods,
+ cluster_sorted_periods,
+ use_predefined_assignments,
+)
+from tsam.pipeline.extremes import add_extreme_periods
+from tsam.pipeline.normalize import denormalize, normalize
+from tsam.pipeline.periods import add_period_sum_features, unstack_to_periods
+from tsam.pipeline.rescale import rescale_representatives
+from tsam.pipeline.segment import segment_typical_periods
+from tsam.pipeline.types import (
+ ClusteringOutput,
+ FormattedOutput,
+ PipelineConfig,
+ PipelineResult,
+ PredefParams, # noqa: F401 (re-exported)
+ PreparedData,
+)
+
+if TYPE_CHECKING:
+ from tsam.config import (
+ Distribution,
+ MinMaxMean,
+ )
+
+
+def _count_occurrences(cluster_order: np.ndarray) -> dict[int, float]:
+ """Count how many original periods each cluster represents.
+
+ Returns float values because partial-period adjustment (step 9)
+ can produce fractional counts downstream.
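+
+    Example:
+
+    >>> _count_occurrences(np.array([0, 0, 1, 2, 2, 2]))
+    {0: 2.0, 1: 1.0, 2: 3.0}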
+ """
+ nums, counts = np.unique(cluster_order, return_counts=True)
+ return {int(num): float(counts[ii]) for ii, num in enumerate(nums)}
+
+
+def _representatives_to_dataframe(
+ cluster_periods_list: list[np.ndarray],
+ column_index: pd.MultiIndex,
+) -> pd.DataFrame:
+ """Reshape flat cluster period vectors into a MultiIndex DataFrame.
+
+ Converts a list of 1-D arrays (one per cluster) into a DataFrame
+ indexed by (PeriodNum, TimeStep) with the original column names.
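+
+    For example (shapes only): three cluster vectors, each of length
+    2 columns * 24 timesteps, become a (3 * 24)-row, 2-column frame.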
+ """
+ df = (
+ pd.concat(
+ [pd.Series(s, index=column_index) for s in cluster_periods_list],
+ axis=1,
+ )
+ .unstack("TimeStep")
+ .T
+ )
+ assert isinstance(df, pd.DataFrame)
+ return df
+
+
+def _warn_if_out_of_bounds(
+ typical_periods: pd.DataFrame,
+ original_data: pd.DataFrame,
+ tolerance: float,
+) -> None:
+ """Warn if aggregated values exceed original data bounds."""
+ exceeds_max = typical_periods.max(axis=0) > original_data.max(axis=0)
+ if exceeds_max.any():
+ diff = typical_periods.max(axis=0) - original_data.max(axis=0)
+ exceeding_diff = diff[exceeds_max]
+ if exceeding_diff.max() > tolerance:
+            warnings.warn(
+                "At least one maximal value of the aggregated time series "
+                "exceeds the maximal value of the input time series for: "
+                f"{exceeding_diff.to_dict()}. To silence this warning, "
+                "set 'numerical_tolerance' to a higher value."
+            )
+ below_min = typical_periods.min(axis=0) < original_data.min(axis=0)
+ if below_min.any():
+ diff = original_data.min(axis=0) - typical_periods.min(axis=0)
+ exceeding_diff = diff[below_min]
+ if exceeding_diff.max() > tolerance:
+            warnings.warn(
+                "At least one minimal value of the aggregated time series "
+                "falls below the minimal value of the input time series for: "
+                f"{exceeding_diff.to_dict()}. To silence this warning, "
+                "set 'numerical_tolerance' to a higher value."
+            )
+
+
+def _apply_weights_df(
+ df: pd.DataFrame, weights: dict[str, float] | None
+) -> pd.DataFrame:
+ """Multiply DataFrame columns by weights for segmentation.
+
+ Segmentation boundaries are determined in weighted space so that
+ high-weight columns have more influence on where segments fall.
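+
+    Example (illustrative; round-trips with ``_remove_weights_df``):
+
+    >>> df = pd.DataFrame({"demand": [1.0, 2.0], "solar": [3.0, 4.0]})
+    >>> _apply_weights_df(df, {"demand": 2.0})["demand"].tolist()
+    [2.0, 4.0]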
+ """
+ if not weights:
+ return df
+ out = df.copy()
+ for col, w in weights.items():
+ if col in out.columns:
+ out[col] *= w
+ return pd.DataFrame(out)
+
+
+def _remove_weights_df(
+ df: pd.DataFrame, weights: dict[str, float] | None
+) -> pd.DataFrame:
+ """Divide out weights after segmentation to restore unweighted values."""
+ if not weights:
+ return df
+ out = df.copy()
+ for col, w in weights.items():
+ if col in out.columns:
+ out[col] /= w
+ return pd.DataFrame(out)
+
+
+def _build_weight_vector(
+ columns: pd.Index,
+ weights: dict[str, float] | None,
+) -> np.ndarray | None:
+ """Build a weight array aligned to *columns*, defaulting unlisted columns to 1.0.
+
+ Returns ``None`` if all weights are 1.0 (no weighting needed).
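+
+    Example:
+
+    >>> _build_weight_vector(pd.Index(["demand", "solar"]), {"demand": 2.0})
+    array([2., 1.])
+    >>> _build_weight_vector(pd.Index(["demand"]), {"demand": 1.0}) is None
+    True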
+ """
+ if not weights:
+ return None
+ result: list[float] = []
+ any_non_unit = False
+ for col in columns:
+ w = weights.get(col, 1.0)
+ if w < options.min_weight:
+            warnings.warn(
+                f'weight of "{col}" raised to the minimum allowed value '
+                f"({options.min_weight})",
+                stacklevel=2,
+            )
+ w = options.min_weight
+ if w != 1.0:
+ any_non_unit = True
+ result.append(w)
+ return np.array(result) if any_non_unit else None
+
+
+def _build_representation_dict(
+ columns: pd.Index,
+ cluster_representation: str | Distribution | MinMaxMean | None,
+) -> dict[str, str]:
+ """Build the representation dict (mean/min/max per column) from config."""
+ from tsam.config import MinMaxMean
+
+ representation_dict: dict[str, str] = dict.fromkeys(columns, "mean")
+ if isinstance(cluster_representation, MinMaxMean):
+ for col in cluster_representation.max_columns:
+ if col in representation_dict:
+ representation_dict[col] = "max"
+ for col in cluster_representation.min_columns:
+ if col in representation_dict:
+ representation_dict[col] = "min"
+ return representation_dict
+
+
+def _prepare_data(
+ data: pd.DataFrame,
+ cfg: PipelineConfig,
+) -> PreparedData:
+ """Phase 1: Build representation dict, normalize, unstack, weight (steps 1-3)."""
+ cluster = cfg.cluster
+ cluster_representation = cluster.get_representation()
+ representation_dict = _build_representation_dict(
+ data.columns, cluster_representation
+ )
+ original_column_order = list(data.columns)
+ original_data = data.copy()
+
+ # Step 1: Normalize
+ norm_data = normalize(data, cluster.scale_by_column_means)
+
+ # Step 2: Unstack to periods
+ period_profiles = unstack_to_periods(norm_data.values, cfg.n_timesteps_per_period)
+ candidates = period_profiles.profiles_dataframe.values
+
+ # Step 2b: Apply weights directly to candidates
+ weight_vector = _build_weight_vector(norm_data.values.columns, cluster.weights)
+ weighted_profiles_df: pd.DataFrame | None = None
+ if weight_vector is not None:
+ weight_tile = np.repeat(weight_vector, period_profiles.n_timesteps_per_period)
+ candidates = candidates * weight_tile
+ # Keep a weighted DataFrame for extremes/segmentation (need column labels).
+ wpdf = period_profiles.profiles_dataframe.copy()
+ for col_name, w in zip(
+ wpdf.columns.get_level_values(0).unique(),
+ weight_vector,
+ ):
+ wpdf[col_name] *= w
+ weighted_profiles_df = wpdf
+
+ # Step 3: Add period sum features if requested
+ # Period sums are extra columns appended for clustering distance only;
+ # they must NOT reach representations() which expects original columns.
+ n_feature_cols = candidates.shape[1]
+ if cluster.include_period_sums:
+ candidates, _n_extra = add_period_sum_features(
+ period_profiles.profiles_dataframe, candidates
+ )
+
+ return PreparedData(
+ norm_data=norm_data,
+ period_profiles=period_profiles,
+ candidates=candidates,
+ representation_dict=representation_dict,
+ n_feature_cols=n_feature_cols,
+ original_column_order=original_column_order,
+ original_data=original_data,
+ weight_vector=weight_vector,
+ weighted_profiles_df=weighted_profiles_df,
+ )
+
+
+def _cluster_and_postprocess(
+ prepared: PreparedData,
+ cfg: PipelineConfig,
+ data_length: int,
+) -> ClusteringOutput:
+ """Phase 2: Cluster, trim, extremes, counts, rescale, partial (steps 4-9)."""
+ cluster = cfg.cluster
+ cluster_representation = cluster.get_representation()
+ candidates = prepared.candidates
+ period_profiles = prepared.period_profiles
+
+ # Step 4: Cluster
+ clustering_duration = 0.0
+ cluster_center_indices: list[int] | None = None
+
+ if cfg.predef is not None:
+ cluster_centers, cluster_center_indices, cluster_order = (
+ use_predefined_assignments(
+ candidates,
+ cfg.predef,
+ cluster_representation,
+ prepared.representation_dict,
+ cfg.n_timesteps_per_period,
+ )
+ )
+ else:
+ t_start = time.time()
+ # When period-sum features are appended, representations must run
+ # on the non-augmented prefix so period-sum columns don't leak in.
+ rep_candidates: np.ndarray | None = None
+ if candidates.shape[1] != prepared.n_feature_cols:
+ rep_candidates = candidates[:, : prepared.n_feature_cols]
+
+ if not cluster.use_duration_curves:
+ cluster_centers, cluster_center_indices, cluster_order = cluster_periods(
+ candidates,
+ cfg.n_clusters,
+ cluster,
+ prepared.representation_dict,
+ cfg.n_timesteps_per_period,
+ representation_candidates=rep_candidates,
+ )
+ else:
+ cluster_centers, cluster_center_indices, cluster_order = (
+ cluster_sorted_periods(
+ candidates,
+ period_profiles.n_columns,
+ cfg.n_clusters,
+ cluster,
+ prepared.representation_dict,
+ cfg.n_timesteps_per_period,
+ )
+ )
+ clustering_duration = time.time() - t_start
+
+ # Ensure cluster_order is always np.ndarray
+ cluster_order = np.asarray(cluster_order)
+
+ # Step 5: Trim eval features from representatives (still weighted)
+ cluster_periods_list: list[np.ndarray] = [
+ center[: prepared.n_feature_cols] for center in cluster_centers
+ ]
+
+ # Step 6: Add extreme periods if configured
+ # Extremes run in weighted space (matching develop): weighted profiles
+ # determine which period is extreme, and extracted profiles carry weights.
+ # Unweighting happens after, so all centers are treated uniformly.
+ extreme_periods_info: dict[str, dict] = {}
+ extreme_cluster_idx: list[int] = []
+
+ if cfg.extremes is not None:
+ profiles_for_extremes = (
+ prepared.weighted_profiles_df
+ if prepared.weighted_profiles_df is not None
+ else period_profiles.profiles_dataframe
+ )
+ (
+ cluster_periods_list,
+ cluster_order,
+ extreme_cluster_idx,
+ extreme_periods_info,
+ ) = add_extreme_periods(
+ profiles_for_extremes,
+ cluster_periods_list,
+ cluster_order,
+ cfg.extremes,
+ )
+ cluster_order = np.asarray(cluster_order)
+ else:
+ if cfg.predef is not None and cfg.predef.extreme_cluster_idx is not None:
+ extreme_cluster_idx = list(cfg.predef.extreme_cluster_idx)
+
+    # Unweight all representatives (regular and extreme): remove weights before
+    # downstream steps (rescale, denormalize), which expect unweighted data.
+ if prepared.weight_vector is not None:
+ inv_tile = np.repeat(1.0 / prepared.weight_vector, cfg.n_timesteps_per_period)
+ cluster_periods_list = [center * inv_tile for center in cluster_periods_list]
+
+ # Step 7: Compute cluster counts
+ cluster_counts = _count_occurrences(cluster_order)
+
+ # Step 8: Rescale if requested
+ rescale_deviations: dict[str, dict] = {}
+ rescale_exclude = cfg.rescale_exclude_columns or []
+ if cfg.rescale_cluster_periods:
+ cluster_periods_list, rescale_deviations = rescale_representatives( # type: ignore[assignment]
+ cluster_periods_list,
+ cluster_counts,
+ extreme_cluster_idx,
+ period_profiles.profiles_dataframe,
+ prepared.original_data,
+ prepared.norm_data.scale_by_column_means,
+ cfg.n_timesteps_per_period,
+ rescale_exclude,
+ )
+ cluster_periods_list = list(cluster_periods_list)
+
+ # Step 9: Adjust for partial periods
+ if data_length % cfg.n_timesteps_per_period != 0:
+ last_cluster = int(cluster_order[-1])
+ cluster_counts[last_cluster] -= (
+ 1
+ - float(data_length % cfg.n_timesteps_per_period)
+ / cfg.n_timesteps_per_period
+ )
+
+ return ClusteringOutput(
+ cluster_periods_list=cluster_periods_list,
+ cluster_order=cluster_order,
+ cluster_counts=cluster_counts,
+ cluster_center_indices=cluster_center_indices,
+ extreme_cluster_idx=extreme_cluster_idx,
+ extreme_periods_info=extreme_periods_info,
+ clustering_duration=clustering_duration,
+ rescale_deviations=rescale_deviations,
+ )
+
+
+def _format_and_reconstruct(
+ prepared: PreparedData,
+ clustered: ClusteringOutput,
+ cfg: PipelineConfig,
+) -> FormattedOutput:
+ """Phase 3: Format, segment, denorm, bounds, reconstruct + accuracy (steps 10-14)."""
+ norm_data = prepared.norm_data
+ period_profiles = prepared.period_profiles
+
+ # Step 10: Format representatives to MultiIndex DataFrame
+ normalized_typical_periods = _representatives_to_dataframe(
+ clustered.cluster_periods_list, period_profiles.column_index
+ )
+
+ # Step 11: Segmentation if configured
+ segmented_df = None
+ predicted_segmented_df = None
+ segment_center_indices = None
+
+ if cfg.segments is not None:
+ # Segmentation runs in weighted space so that high-weight columns
+ # have more influence on segment boundaries. Weights are removed
+ # from the output before denormalization.
+ weights = cfg.cluster.weights
+ segmentation_input = _apply_weights_df(normalized_typical_periods, weights)
+ segmented_df, predicted_segmented_df, segment_center_indices = (
+ segment_typical_periods(
+ segmentation_input,
+ cfg.n_timesteps_per_period,
+ cfg.segments,
+ prepared.representation_dict,
+ cfg.predef,
+ )
+ )
+ segmented_df = _remove_weights_df(segmented_df, weights)
+ predicted_segmented_df = _remove_weights_df(predicted_segmented_df, weights)
+ segmented_normalized = segmented_df.reset_index(level=3, drop=True)
+ denorm_source = segmented_normalized
+ reconstruct_source = predicted_segmented_df
+ else:
+ denorm_source = normalized_typical_periods
+ reconstruct_source = normalized_typical_periods
+
+ # Step 12: Denormalize -> typical_periods
+ typical_periods = denormalize(denorm_source, norm_data)
+ if cfg.round_decimals is not None:
+ typical_periods = typical_periods.round(decimals=cfg.round_decimals)
+
+ # Step 13: Bounds check + warnings
+ _warn_if_out_of_bounds(
+ typical_periods, prepared.original_data, cfg.numerical_tolerance
+ )
+
+ # Step 14: Reconstruct + compute accuracy
+ reconstructed_data, normalized_predicted = reconstruct(
+ reconstruct_source,
+ clustered.cluster_order,
+ period_profiles,
+ norm_data,
+ prepared.original_data,
+ )
+ if cfg.round_decimals is not None:
+ reconstructed_data = reconstructed_data.round(decimals=cfg.round_decimals)
+
+ # Restore original column order
+ typical_periods = typical_periods[prepared.original_column_order]
+ reconstructed_data = reconstructed_data[prepared.original_column_order]
+
+ return FormattedOutput(
+ typical_periods=typical_periods,
+ reconstructed_data=reconstructed_data,
+ normalized_predicted=normalized_predicted,
+ segmented_df=segmented_df,
+ segment_center_indices=segment_center_indices,
+ )
+
+
+def _assemble_result(
+ prepared: PreparedData,
+ clustered: ClusteringOutput,
+ formatted: FormattedOutput,
+ cfg: PipelineConfig,
+) -> PipelineResult:
+ """Phase 4: Build ClusteringResult + PipelineResult (steps 15-16)."""
+ from tsam.config import ClusteringResult as _ClusteringResult
+
+ original_data_out = prepared.original_data[prepared.original_column_order]
+
+ clustering_result = _ClusteringResult.from_pipeline(
+ cluster_center_indices=clustered.cluster_center_indices,
+ extreme_periods_info=clustered.extreme_periods_info,
+ extremes_config=cfg.extremes,
+ cluster_order=clustered.cluster_order,
+ segmented_df=formatted.segmented_df,
+ segment_center_indices=formatted.segment_center_indices,
+ n_timesteps_per_period=cfg.n_timesteps_per_period,
+ temporal_resolution=cfg.temporal_resolution,
+ original_data=original_data_out,
+ cluster_config=cfg.cluster,
+ segment_config=cfg.segments,
+ rescale_cluster_periods=cfg.rescale_cluster_periods,
+ rescale_exclude_columns=cfg.rescale_exclude_columns or [],
+ extreme_cluster_idx=clustered.extreme_cluster_idx,
+ )
+
+ return PipelineResult(
+ typical_periods=formatted.typical_periods,
+ cluster_counts=clustered.cluster_counts,
+ n_timesteps_per_period=cfg.n_timesteps_per_period,
+ time_index=prepared.period_profiles.time_index,
+ original_data=original_data_out,
+ clustering_duration=clustered.clustering_duration,
+ rescale_deviations=clustered.rescale_deviations,
+ segmented_df=formatted.segmented_df,
+ reconstructed_data=formatted.reconstructed_data,
+ _norm_values=prepared.norm_data.values,
+ _normalized_predicted=formatted.normalized_predicted,
+ clustering_result=clustering_result,
+ )
+
+
+def run_pipeline(
+ data: pd.DataFrame,
+ cfg: PipelineConfig,
+) -> PipelineResult:
+ """Run the full aggregation pipeline.
+
+ This replaces create_typical_periods() + predict_original_data() + accuracy_indicators().
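+
+    Minimal usage sketch (illustrative; assumes a default-constructible
+    ``ClusterConfig`` and an hourly input DataFrame ``hourly_df``):
+
+    >>> cfg = PipelineConfig(
+    ...     n_clusters=8,
+    ...     n_timesteps_per_period=24,
+    ...     cluster=ClusterConfig(),
+    ... )  # doctest: +SKIP
+    >>> result = run_pipeline(hourly_df, cfg)  # doctest: +SKIP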
+ """
+ prepared = _prepare_data(data, cfg)
+
+ clustered = _cluster_and_postprocess(
+ prepared,
+ cfg,
+ data_length=len(data),
+ )
+
+ formatted = _format_and_reconstruct(
+ prepared,
+ clustered,
+ cfg,
+ )
+
+ return _assemble_result(
+ prepared,
+ clustered,
+ formatted,
+ cfg,
+ )
diff --git a/src/tsam/pipeline/accuracy.py b/src/tsam/pipeline/accuracy.py
new file mode 100644
index 00000000..d71fb0db
--- /dev/null
+++ b/src/tsam/pipeline/accuracy.py
@@ -0,0 +1,80 @@
+"""Reconstruction and accuracy computation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+
+from tsam.pipeline.normalize import denormalize
+
+if TYPE_CHECKING:
+ from tsam.pipeline.types import NormalizedData, PeriodProfiles
+
+
+def reconstruct(
+ typical_periods: pd.DataFrame,
+ cluster_order: np.ndarray,
+ period_profiles: PeriodProfiles,
+ norm_data: NormalizedData,
+ original_data: pd.DataFrame,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+ """Expand typical periods via assignments, denormalize.
+
+ Returns (denormalized_predicted, normalized_predicted).
+ """
+ # Unstack once, then use vectorized indexing to select periods by cluster order
+ typical_unstacked = typical_periods.unstack()
+ reconstructed = typical_unstacked.loc[list(cluster_order)].values
+
+ # Back in matrix form
+ clustered_data_df = pd.DataFrame(
+ reconstructed,
+ columns=period_profiles.column_index,
+ index=period_profiles.profiles_dataframe.index,
+ )
+ clustered_data_df = clustered_data_df.stack(future_stack=True, level="TimeStep") # type: ignore[assignment]
+
+ # Trim to original data length
+ original_len = len(original_data)
+ normalized_predicted = pd.DataFrame(
+ clustered_data_df.values[:original_len],
+ index=original_data.index,
+ columns=original_data.columns,
+ )
+
+ denormalized = denormalize(normalized_predicted, norm_data)
+
+ return denormalized, normalized_predicted
+
+
+def compute_accuracy(
+ normalized_original: pd.DataFrame,
+ normalized_predicted: pd.DataFrame,
+) -> pd.DataFrame:
+ """Compute RMSE, MAE, duration RMSE per column.
+
+    Both inputs are unweighted normalized data, so they are compared directly.
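+
+    Example (illustrative sketch):
+
+    >>> orig = pd.DataFrame({"a": [0.0, 1.0]})
+    >>> pred = pd.DataFrame({"a": [0.0, 0.0]})
+    >>> float(compute_accuracy(orig, pred).loc["a", "MAE"])
+    0.5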
+ """
+ indicator_raw: dict[str, dict] = {
+ "RMSE": {},
+ "RMSE_duration": {},
+ "MAE": {},
+ }
+
+ for column in normalized_original.columns:
+ orig_ts = normalized_original[column]
+ pred_ts = normalized_predicted[column]
+
+ indicator_raw["RMSE"][column] = np.sqrt(mean_squared_error(orig_ts, pred_ts))
+ indicator_raw["RMSE_duration"][column] = np.sqrt(
+ mean_squared_error(
+ orig_ts.sort_values(ascending=False).reset_index(drop=True),
+ pred_ts.sort_values(ascending=False).reset_index(drop=True),
+ )
+ )
+ indicator_raw["MAE"][column] = mean_absolute_error(orig_ts, pred_ts)
+
+ return pd.DataFrame(indicator_raw)
diff --git a/src/tsam/pipeline/clustering.py b/src/tsam/pipeline/clustering.py
new file mode 100644
index 00000000..8e9aaca8
--- /dev/null
+++ b/src/tsam/pipeline/clustering.py
@@ -0,0 +1,127 @@
+"""Clustering wrappers around period_aggregation and representations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from tsam.period_aggregation import aggregate_periods
+from tsam.representations import representations
+
+if TYPE_CHECKING:
+ from tsam.config import ClusterConfig, Distribution, MinMaxMean
+ from tsam.pipeline.types import PredefParams
+
+
+def cluster_periods(
+ candidates: np.ndarray,
+ n_clusters: int,
+ cluster: ClusterConfig,
+ representation_dict: dict | None,
+ n_timesteps_per_period: int,
+ representation_candidates: np.ndarray | None,
+) -> tuple[list[np.ndarray], list[int] | None, np.ndarray]:
+ """Run clustering via aggregate_periods.
+
+ Candidates are already weighted (if weights exist).
+
+ If *representation_candidates* is provided, representations are computed
+ from those columns instead of from *candidates* (used when period-sum
+ features are appended for clustering distance only).
+
+ Returns (cluster_centers, cluster_center_indices, cluster_order).
+ """
+ centers, center_indices, order = aggregate_periods(
+ candidates,
+ n_clusters=n_clusters,
+ n_iter=100,
+ solver=cluster.solver,
+ cluster_method=cluster.method,
+ representation_method=cluster.get_representation(),
+ representation_dict=representation_dict,
+ n_timesteps_per_period=n_timesteps_per_period,
+ representation_candidates=representation_candidates,
+ )
+ return centers, center_indices, order
+
+
+def cluster_sorted_periods(
+ candidates: np.ndarray,
+ n_columns: int,
+ n_clusters: int,
+ cluster: ClusterConfig,
+ representation_dict: dict | None,
+ n_timesteps_per_period: int,
+) -> tuple[list[np.ndarray], list[int] | None, np.ndarray]:
+ """Duration-curve clustering: sort descending, cluster, pick medoid from original.
+
+ Candidates are already weighted (if weights exist). Medoids are picked
+ from the (weighted) unsorted candidates.
+
+ Returns (cluster_centers, cluster_center_indices, cluster_order).
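+
+    Illustrative sketch of the per-column descending sort (one period, two
+    columns, two timesteps each):
+
+    >>> x = np.array([[1.0, 3.0, 0.0, 2.0]])
+    >>> (-np.sort(-x.reshape(1, 2, 2), axis=2)).reshape(1, -1)
+    array([[3., 1., 2., 0.]])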
+ """
+    # Sort each period's timesteps descending for all columns.
+    # Use candidates (already weighted) so that clustering distance respects
+    # column weights, matching v3 behaviour.
+ n_periods, n_total = candidates.shape
+ n_timesteps = n_total // n_columns
+
+ values_3d = candidates.copy().reshape(n_periods, n_columns, n_timesteps)
+ sorted_values = (-np.sort(-values_3d, axis=2, kind="stable")).reshape(n_periods, -1)
+
+ _, center_indices, cluster_order = aggregate_periods(
+ sorted_values,
+ n_clusters=n_clusters,
+ n_iter=30,
+ solver=cluster.solver,
+ cluster_method=cluster.method,
+ representation_method=cluster.get_representation(),
+ representation_dict=representation_dict,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+
+ # Pick medoid from unsorted candidates (already weighted).
+ cluster_centers = []
+ for cluster_num in np.unique(cluster_order):
+ indices = np.where(cluster_order == cluster_num)[0]
+ if len(indices) > 1:
+ current_mean = sorted_values[indices].mean(axis=0)
+ mindist_idx = np.argmin(
+ np.square(sorted_values[indices] - current_mean).sum(axis=1)
+ )
+ cluster_centers.append(candidates[indices][mindist_idx])
+ else:
+ cluster_centers.append(candidates[indices][0])
+
+ return cluster_centers, center_indices, cluster_order
+
+
+def use_predefined_assignments(
+ candidates: np.ndarray,
+ predef: PredefParams,
+ representation_method: str | Distribution | MinMaxMean | None,
+ representation_dict: dict | None,
+ n_timesteps_per_period: int,
+) -> tuple[list[np.ndarray] | np.ndarray, list[int] | None, list | np.ndarray]:
+ """Skip clustering, compute representatives from predefined assignments.
+
+ Returns (cluster_centers, cluster_center_indices, cluster_order).
+ """
+ if predef.cluster_center_indices is not None:
+ return (
+ candidates[predef.cluster_center_indices],
+ list(predef.cluster_center_indices),
+ predef.cluster_order,
+ )
+ else:
+ centers, computed_indices = representations(
+ candidates,
+ predef.cluster_order, # type: ignore[arg-type]
+ default="medoid",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ n_timesteps_per_period=n_timesteps_per_period,
+ )
+ return centers, computed_indices, predef.cluster_order
diff --git a/src/tsam/pipeline/extremes.py b/src/tsam/pipeline/extremes.py
new file mode 100644
index 00000000..7e5b2014
--- /dev/null
+++ b/src/tsam/pipeline/extremes.py
@@ -0,0 +1,168 @@
+"""Extreme period counting and integration."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import numpy as np
+ import pandas as pd
+
+ from tsam.config import ExtremeConfig
+
+
+def _append_col_with(column, append_with: str = " max."):
+ """Append a string to the column name. For MultiIndexes, only last level is changed."""
+ if isinstance(column, str):
+ return column + append_with
+ elif isinstance(column, tuple):
+ col = list(column)
+ col[-1] = col[-1] + append_with
+ return tuple(col)
+
+
+def _detect_extreme(
+ profiles_df: pd.DataFrame,
+ column,
+    step_no,
+ suffix: str,
+ extreme_period_no: list,
+ cc_list: list,
+ extreme_periods: dict,
+) -> None:
+ """Detect a single extreme period and add it to extreme_periods if unique."""
+ step_no = series # already reduced to a scalar index
+ if (
+ step_no not in extreme_period_no
+ and profiles_df.loc[step_no, :].values.tolist() not in cc_list
+ ):
+ key = _append_col_with(column, suffix)
+ extreme_periods[key] = {
+ "step_no": step_no,
+ "profile": profiles_df.loc[step_no, :].values,
+ "column": column,
+ }
+ extreme_period_no.append(step_no)
+
+
+def add_extreme_periods(
+ profiles_df: pd.DataFrame,
+ cluster_centers: list,
+ cluster_order: list | np.ndarray,
+ extremes: ExtremeConfig,
+) -> tuple[list, list | np.ndarray, list[int], dict]:
+ """Add extreme periods to clustered data.
+
+ Returns (new_cluster_centers, new_cluster_order, extreme_cluster_idx, extreme_periods_info).
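+
+    Behaviour depends on ``extremes.method``: "append" adds the extremes as
+    new clusters and reassigns only the extreme periods themselves,
+    "new_cluster" additionally reassigns every period that lies closer to an
+    extreme profile than to its own cluster center, and "replace" overwrites
+    the affected column of the extreme period's current cluster center.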
+ """
+ columns = profiles_df.columns.get_level_values(0).unique().tolist()
+ extreme_periods: dict = {}
+ extreme_period_no: list = []
+
+ cc_list = [center.tolist() for center in cluster_centers]
+
+ # Detect extreme periods for each column
+ _CHECKS: list[tuple[list[str], str, str]] = [
+ (extremes.max_value, "max", " max."),
+ (extremes.min_value, "min", " min."),
+ (extremes.max_period, "mean_max", " daily max."),
+ (extremes.min_period, "mean_min", " daily min."),
+ ]
+ for column in columns:
+ for config_list, kind, suffix in _CHECKS:
+ if column not in config_list:
+ continue
+ if kind == "max":
+ step_no = profiles_df[column].max(axis=1).idxmax() # type: ignore[arg-type]
+ elif kind == "min":
+ step_no = profiles_df[column].min(axis=1).idxmin() # type: ignore[arg-type]
+ elif kind == "mean_max":
+ step_no = profiles_df[column].mean(axis=1).idxmax() # type: ignore[call-overload]
+ else: # mean_min
+ step_no = profiles_df[column].mean(axis=1).idxmin() # type: ignore[call-overload]
+ _detect_extreme(
+ profiles_df,
+ column,
+ step_no,
+ suffix,
+ extreme_period_no,
+ cc_list,
+ extreme_periods,
+ )
+
+ # Get current related clusters of extreme periods
+ for period_type in extreme_periods:
+ extreme_periods[period_type]["cluster_no"] = cluster_order[
+ extreme_periods[period_type]["step_no"]
+ ]
+
+ new_cluster_centers: list = []
+ new_cluster_order = list(cluster_order)
+ extreme_cluster_idx: list[int] = []
+
+ if extremes.method == "append":
+ for cluster_center in cluster_centers:
+ new_cluster_centers.append(cluster_center)
+ for i, period_type in enumerate(extreme_periods):
+ extreme_cluster_idx.append(len(new_cluster_centers))
+ new_cluster_centers.append(extreme_periods[period_type]["profile"])
+ new_cluster_order[extreme_periods[period_type]["step_no"]] = i + len(
+ cluster_centers
+ )
+
+ elif extremes.method == "new_cluster":
+ for cluster_center in cluster_centers:
+ new_cluster_centers.append(cluster_center)
+ for i, period_type in enumerate(extreme_periods):
+ extreme_cluster_idx.append(len(new_cluster_centers))
+ new_cluster_centers.append(extreme_periods[period_type]["profile"])
+ extreme_periods[period_type]["new_cluster_no"] = i + len(cluster_centers)
+
+ # Build set of extreme period step numbers for quick lookup
+ extreme_step_nos = {extreme_periods[pt]["step_no"] for pt in extreme_periods}
+
+ for i, c_period in enumerate(new_cluster_order):
+ # Skip periods that are themselves an extreme period for a different type
+ if i in extreme_step_nos:
+ # Only reassign if this period IS the extreme for exactly one type
+ own_types = [
+ pt for pt in extreme_periods if extreme_periods[pt]["step_no"] == i
+ ]
+ if len(own_types) == 1:
+ new_cluster_order[i] = extreme_periods[own_types[0]][
+ "new_cluster_no"
+ ]
+ continue
+
+ cluster_dist = sum(
+ (profiles_df.iloc[i].values - cluster_centers[c_period]) ** 2
+ )
+ # Find the closest extreme period (deterministic: first match with smallest distance)
+ best_extreme = None
+ best_dist = cluster_dist
+ for extrem_period_type in extreme_periods:
+ extperiod_dist = sum(
+ (
+ profiles_df.iloc[i].values
+ - extreme_periods[extrem_period_type]["profile"]
+ )
+ ** 2
+ )
+ if extperiod_dist < best_dist:
+ best_dist = extperiod_dist
+ best_extreme = extrem_period_type
+
+ if best_extreme is not None:
+ new_cluster_order[i] = extreme_periods[best_extreme]["new_cluster_no"]
+
+ elif extremes.method == "replace":
+ new_cluster_centers = list(cluster_centers)
+ for period_type in extreme_periods:
+ index = profiles_df.columns.get_loc(extreme_periods[period_type]["column"])
+ new_cluster_centers[extreme_periods[period_type]["cluster_no"]][index] = (
+ extreme_periods[period_type]["profile"][index]
+ )
+ if extreme_periods[period_type]["cluster_no"] not in extreme_cluster_idx:
+ extreme_cluster_idx.append(extreme_periods[period_type]["cluster_no"])
+
+ return new_cluster_centers, new_cluster_order, extreme_cluster_idx, extreme_periods
diff --git a/src/tsam/pipeline/normalize.py b/src/tsam/pipeline/normalize.py
new file mode 100644
index 00000000..42f23acb
--- /dev/null
+++ b/src/tsam/pipeline/normalize.py
@@ -0,0 +1,63 @@
+"""Normalization and denormalization of time series data."""
+
+from __future__ import annotations
+
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+
+from tsam.pipeline.types import NormalizedData
+
+
+def normalize(
+ data: pd.DataFrame,
+ scale_by_column_means: bool,
+) -> NormalizedData:
+ """Cast float, fit MinMaxScaler, normalize, optionally divide by column means.
+
+ Weights are NOT applied here — they are used only for clustering distance.
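+
+    Example (illustrative sketch):
+
+    >>> df = pd.DataFrame({"a": [0.0, 2.0, 4.0]})
+    >>> normalize(df, scale_by_column_means=False).values["a"].tolist()
+    [0.0, 0.5, 1.0]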
+ """
+ data = data.astype(float)
+
+ # Fit MinMaxScaler and normalize
+ scaler = MinMaxScaler()
+ normalized = pd.DataFrame(
+ scaler.fit_transform(data),
+ columns=data.columns,
+ index=data.index,
+ )
+
+ # Store mean before scale_by_column_means division
+ normalized_mean = normalized.mean()
+
+ if scale_by_column_means:
+ normalized = normalized / normalized_mean
+
+ return NormalizedData(
+ values=normalized,
+ scaler=scaler,
+ normalized_mean=normalized_mean,
+ scale_by_column_means=scale_by_column_means,
+ )
+
+
+def denormalize(
+ df: pd.DataFrame,
+ norm_data: NormalizedData,
+) -> pd.DataFrame:
+ """Undo normalization using stored scaler.
+
+    No weight logic here; weights are only used for clustering distance.
+ """
+ result = df.copy()
+
+ if norm_data.scale_by_column_means:
+ result = result * norm_data.normalized_mean
+
+ # Inverse transform using stored scaler
+ unnormalized = pd.DataFrame(
+ norm_data.scaler.inverse_transform(result),
+ columns=result.columns,
+ index=result.index,
+ )
+
+ return unnormalized
diff --git a/src/tsam/pipeline/periods.py b/src/tsam/pipeline/periods.py
new file mode 100644
index 00000000..a2e039b7
--- /dev/null
+++ b/src/tsam/pipeline/periods.py
@@ -0,0 +1,83 @@
+"""Period unstacking and feature augmentation."""
+
+from __future__ import annotations
+
+import copy
+
+import numpy as np
+import pandas as pd
+
+from tsam.pipeline.types import PeriodProfiles
+
+
+def unstack_to_periods(
+ normalized_ts: pd.DataFrame,
+ n_timesteps_per_period: int,
+) -> PeriodProfiles:
+ """Extend to integer multiple of period length, reshape to period matrix."""
+ unstacked = normalized_ts.copy()
+
+ # Extend to integer multiple of period length
+    if len(normalized_ts) % n_timesteps_per_period != 0:
+ attached_timesteps = (
+ n_timesteps_per_period - len(normalized_ts) % n_timesteps_per_period
+ )
+ rep_data = unstacked.head(attached_timesteps)
+ unstacked = pd.concat([unstacked, rep_data])
+
+ # Create period and step index
+ period_index = []
+ step_index = []
+    for ii in range(len(unstacked)):
+        period, step = divmod(ii, n_timesteps_per_period)
+        period_index.append(period)
+        step_index.append(step)
+
+ # Save old index
+ time_index = copy.deepcopy(unstacked.index)
+
+ # Create new double index and unstack
+ unstacked.index = pd.MultiIndex.from_arrays(
+ [step_index, period_index], names=["TimeStep", "PeriodNum"]
+ )
+ unstacked = unstacked.unstack(level="TimeStep") # type: ignore[assignment]
+
+ # Check for NaN
+ if unstacked.isnull().values.any():
+ raise ValueError(
+ "Pre processed data includes NaN. Please check the time_series input data."
+ )
+
+ n_periods = len(unstacked)
+ n_columns = len(normalized_ts.columns)
+
+ return PeriodProfiles(
+ column_index=unstacked.columns, # type: ignore[arg-type]
+ time_index=time_index,
+ profiles_dataframe=unstacked,
+ n_timesteps_per_period=n_timesteps_per_period,
+ n_columns=n_columns,
+ n_periods=n_periods,
+ )
+
+
+def add_period_sum_features(
+ profiles_df: pd.DataFrame,
+ candidates: np.ndarray,
+) -> tuple[np.ndarray, int]:
+ """Append per-column sums as extra features.
+
+ Returns (augmented_candidates, n_extra_features_to_trim).
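+
+    Example (illustrative sketch with a single column and two timesteps):
+
+    >>> cols = pd.MultiIndex.from_product([["a"], [0, 1]])
+    >>> pdf = pd.DataFrame([[1.0, 2.0]], columns=cols)
+    >>> augmented, n_extra = add_period_sum_features(pdf, pdf.values)
+    >>> augmented.tolist(), n_extra
+    ([[1.0, 2.0, 3.0]], 1)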
+ """
+ evaluation_values = (
+ profiles_df.stack(future_stack=True, level=0).sum(axis=1).unstack(level=1) # type: ignore[arg-type]
+ )
+ n_extra = len(evaluation_values.columns)
+ augmented = np.concatenate(
+ (candidates, evaluation_values.values),
+ axis=1,
+ )
+ return augmented, n_extra
diff --git a/src/tsam/pipeline/rescale.py b/src/tsam/pipeline/rescale.py
new file mode 100644
index 00000000..9b7ea44c
--- /dev/null
+++ b/src/tsam/pipeline/rescale.py
@@ -0,0 +1,112 @@
+"""Rescaling of cluster periods to match original means."""
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from tsam.options import options
+
+
+def rescale_representatives(
+ cluster_periods: list | np.ndarray,
+ cluster_period_no_occur: dict[int, float],
+ extreme_cluster_idx: list[int],
+ profiles_df: pd.DataFrame,
+ original_data: pd.DataFrame,
+ normalize_column_means: bool,
+ n_timesteps_per_period: int,
+ exclude_columns: list[str],
+) -> tuple[np.ndarray, dict]:
+ """Rescale cluster periods so weighted mean matches original.
+
+ Returns (rescaled_periods, deviations_dict).
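+
+    Each iteration scales the non-extreme clusters by
+    ``(sum_raw - sum_peak) / sum_clu_wo_peak`` and then clips the result to
+    ``[0, scale_ub]``; because clipping can reopen the gap to the original
+    integral, the scale-and-clip step is repeated until the deviation falls
+    below ``options.rescale_tolerance`` or the iteration limit is reached.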
+ """
+ columns = list(original_data.columns)
+
+ rescale_deviations: dict = {}
+
+ weighting_vec = pd.Series(cluster_period_no_occur).values
+ n_clusters = len(cluster_periods)
+ n_cols = len(columns)
+ n_timesteps = n_timesteps_per_period
+
+ # Convert to 3D numpy array: (n_clusters, n_cols, n_timesteps)
+ arr = np.array(cluster_periods).reshape(n_clusters, n_cols, n_timesteps)
+
+ # Indices for non-extreme clusters
+ idx_wo_peak = np.delete(np.arange(n_clusters), extreme_cluster_idx)
+ extreme_cluster_idx_arr = np.array(extreme_cluster_idx, dtype=int)
+
+ for ci, column in enumerate(columns):
+ if column in exclude_columns:
+ continue
+
+ col_data = arr[:, ci, :] # (n_clusters, n_timesteps)
+ sum_raw = profiles_df[column].sum().sum()
+
+ # Sum of extreme periods (weighted)
+ if len(extreme_cluster_idx_arr) > 0:
+ sum_peak = np.sum(
+ weighting_vec[extreme_cluster_idx_arr]
+ * col_data[extreme_cluster_idx_arr, :].sum(axis=1)
+ )
+ else:
+ sum_peak = 0.0
+
+ sum_clu_wo_peak = np.sum(
+ weighting_vec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
+ )
+
+        # Upper clipping bound: with column-mean scaling, normalized values
+        # may exceed 1, up to max/mean of the original column.
+ scale_ub = 1.0
+ if normalize_column_means:
+ scale_ub = (
+ scale_ub * original_data[column].max() / original_data[column].mean()
+ )
+
+ # Difference between predicted and original sum
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
+
+ iteration = 0
+ while (
+ diff > sum_raw * options.rescale_tolerance
+ and iteration < options.rescale_max_iterations
+ ):
+ # Rescale values (only non-extreme clusters)
+ arr[idx_wo_peak, ci, :] *= (sum_raw - sum_peak) / sum_clu_wo_peak
+
+ # Reset values higher than the upper scale or less than zero
+ arr[:, ci, :] = np.clip(arr[:, ci, :], 0, scale_ub)
+
+ # Handle NaN (replace with 0)
+ np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0)
+
+ # Calc new sum and new diff to orig data
+ col_data = arr[:, ci, :]
+ sum_clu_wo_peak = np.sum(
+ weighting_vec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
+ )
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
+ iteration += 1
+
+ # Calculate and store final deviation
+ deviation_pct = (diff / sum_raw) * 100 if sum_raw != 0 else 0.0
+ converged = iteration < options.rescale_max_iterations
+ rescale_deviations[column] = {
+ "deviation_pct": deviation_pct,
+ "converged": converged,
+ "iterations": iteration,
+ }
+
+ if not converged and deviation_pct > 0.01:
+ warnings.warn(
+                f'Maximum number of rescaling iterations reached for "{column}". '
+                "The integral of the aggregated time series deviates by "
+                f"{round(deviation_pct, 2)}%."
+ )
+
+ # Reshape back to 2D: (n_clusters, n_cols * n_timesteps)
+ return arr.reshape(n_clusters, -1), rescale_deviations
diff --git a/src/tsam/pipeline/segment.py b/src/tsam/pipeline/segment.py
new file mode 100644
index 00000000..fa41fbe0
--- /dev/null
+++ b/src/tsam/pipeline/segment.py
@@ -0,0 +1,38 @@
+"""Thin wrapper around tsam.utils.segmentation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from tsam.utils.segmentation import segmentation
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+ from tsam.config import SegmentConfig
+ from tsam.pipeline.types import PredefParams
+
+
+def segment_typical_periods(
+ normalized_typical_periods: pd.DataFrame,
+ n_timesteps_per_period: int,
+ segments: SegmentConfig,
+ representation_dict: dict | None,
+ predef: PredefParams | None = None,
+) -> tuple[pd.DataFrame, pd.DataFrame, list]:
+ """Segment typical periods into fewer timesteps.
+
+ Returns (segmented_df, predicted_segmented_df, segment_center_indices).
+ """
+ return segmentation( # type: ignore[no-any-return]
+ normalized_typical_periods,
+ segments.n_segments,
+ n_timesteps_per_period,
+ representation_method=segments.representation,
+ representation_dict=representation_dict,
+ predef_segment_order=predef.segment_order if predef is not None else None,
+ predef_segment_durations=predef.segment_durations
+ if predef is not None
+ else None,
+ predef_segment_centers=predef.segment_centers if predef is not None else None,
+ )
diff --git a/src/tsam/pipeline/types.py b/src/tsam/pipeline/types.py
new file mode 100644
index 00000000..b1350d39
--- /dev/null
+++ b/src/tsam/pipeline/types.py
@@ -0,0 +1,136 @@
+"""Milestone dataclasses for the pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import numpy as np
+ import pandas as pd
+ from sklearn.preprocessing import MinMaxScaler
+
+ from tsam.config import (
+ ClusterConfig,
+ ClusteringResult,
+ ExtremeConfig,
+ SegmentConfig,
+ )
+
+
+@dataclass(frozen=True)
+class PipelineConfig:
+ """All non-data parameters for a pipeline run."""
+
+ n_clusters: int
+ n_timesteps_per_period: int
+ cluster: ClusterConfig
+ extremes: ExtremeConfig | None = None
+ segments: SegmentConfig | None = None
+ rescale_cluster_periods: bool = True
+ rescale_exclude_columns: list[str] | None = None
+ round_decimals: int | None = None
+ numerical_tolerance: float = 1e-13
+ temporal_resolution: float | None = None
+ predef: PredefParams | None = None
+
+
+@dataclass(frozen=True)
+class PredefParams:
+ """Predefined assignments for transfer/apply (skip clustering)."""
+
+ cluster_order: list | np.ndarray
+ cluster_center_indices: list[int] | np.ndarray | None = None
+ extreme_cluster_idx: list[int] | None = None
+ segment_order: list | None = None
+ segment_durations: list | None = None
+ segment_centers: list | None = None
+
+
+@dataclass(frozen=True)
+class NormalizedData:
+ """Carries everything needed for denormalization."""
+
+ values: pd.DataFrame # normalized (unweighted) time series
+ scaler: MinMaxScaler # fitted on original, reusable for inverse_transform
+ normalized_mean: pd.Series # mean before scale_by_column_means division
+ scale_by_column_means: bool # whether scale_by_column_means was applied
+
+
+@dataclass(frozen=True)
+class PeriodProfiles:
+ """The 'candidates' matrix + metadata for reconstruction."""
+
+ column_index: pd.MultiIndex # unstacked column structure
+ time_index: pd.Index # datetime index (possibly extended)
+ profiles_dataframe: pd.DataFrame # unstacked DataFrame
+ n_timesteps_per_period: int
+ n_columns: int
+ n_periods: int
+
+
+@dataclass(frozen=True)
+class PreparedData:
+ """Output of the data preparation phase (steps 1-3)."""
+
+ norm_data: NormalizedData
+ period_profiles: PeriodProfiles
+ candidates: np.ndarray
+ representation_dict: dict[str, str]
+ n_feature_cols: int
+ original_column_order: list[str]
+    # Original input data (used for rescaling, bounds checks, reconstruction).
+    original_data: pd.DataFrame
+ weight_vector: np.ndarray | None = None
+ weighted_profiles_df: pd.DataFrame | None = None
+
+
+@dataclass(frozen=True)
+class ClusteringOutput:
+ """Output of the clustering + post-processing phase (steps 4-9)."""
+
+ cluster_periods_list: list[np.ndarray]
+ cluster_order: np.ndarray
+ cluster_counts: dict[int, float]
+ cluster_center_indices: list[int] | None
+ extreme_cluster_idx: list[int]
+ extreme_periods_info: dict[str, dict]
+ clustering_duration: float
+ rescale_deviations: dict[str, dict]
+
+
+@dataclass(frozen=True)
+class FormattedOutput:
+ """Output of the formatting + reconstruction phase (steps 10-14)."""
+
+ typical_periods: pd.DataFrame
+ reconstructed_data: pd.DataFrame
+ normalized_predicted: pd.DataFrame
+ segmented_df: pd.DataFrame | None
+ segment_center_indices: list | None
+
+
+@dataclass(frozen=True)
+class PipelineResult:
+ """Single handoff from pipeline to api.py / config.py."""
+
+ typical_periods: pd.DataFrame # denormalized, MultiIndex (cluster, timestep)
+ cluster_counts: dict[int, float]
+ n_timesteps_per_period: int
+ time_index: pd.Index
+ original_data: pd.DataFrame
+ clustering_duration: float
+ rescale_deviations: dict[str, dict]
+ segmented_df: pd.DataFrame | None # segmentedNormalizedTypicalPeriods
+ reconstructed_data: pd.DataFrame
+ _norm_values: pd.DataFrame
+ _normalized_predicted: pd.DataFrame
+ clustering_result: ClusteringResult
+
+ @cached_property
+ def accuracy_indicators(self) -> pd.DataFrame:
+ from tsam.pipeline.accuracy import compute_accuracy
+
+ return compute_accuracy(self._norm_values, self._normalized_predicted)
diff --git a/src/tsam/plot.py b/src/tsam/plot.py
index f0a86458..1bad8268 100644
--- a/src/tsam/plot.py
+++ b/src/tsam/plot.py
@@ -159,7 +159,7 @@ def cluster_representatives(
go.Figure
"""
typ = self._result.cluster_representatives
- weights = self._result.cluster_weights
+ weights = self._result.cluster_counts
available_columns = [c for c in typ.columns if c not in ["cluster", "timestep"]]
columns = _validate_columns(
@@ -457,7 +457,7 @@ def cluster_weights(self, title: str = "Cluster Weights") -> go.Figure:
-------
go.Figure
"""
- weights = self._result.cluster_weights
+ weights = self._result.cluster_counts
df = pd.DataFrame(
{
"Period": [f"Period {p}" for p in weights],
diff --git a/src/tsam/representations.py b/src/tsam/representations.py
index 02fc15d7..f56da4c1 100644
--- a/src/tsam/representations.py
+++ b/src/tsam/representations.py
@@ -1,177 +1,195 @@
+from __future__ import annotations
+
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
-from tsam.utils.durationRepresentation import durationRepresentation
+from tsam.config import Distribution, MinMaxMean
+from tsam.utils.duration_representation import duration_representation
+
+# Aliases: old verbose names → new short names.
+# The legacy wrapper sends old names; the pipeline sends new names.
+_ALIASES = {
+ "meanRepresentation": "mean",
+ "medoidRepresentation": "medoid",
+ "maxoidRepresentation": "maxoid",
+ "minmaxmeanRepresentation": "minmax_mean",
+ "durationRepresentation": "distribution",
+ "distributionRepresentation": "distribution",
+ "distributionAndMinMaxRepresentation": "distribution_minmax",
+}
def representations(
- candidates,
- clusterOrder,
- default,
- representationMethod=None,
- representationDict=None,
- distributionPeriodWise=True,
- timeStepsPerPeriod=None,
-):
- clusterCenterIndices = None
- if representationMethod is None:
- representationMethod = default
- if representationMethod == "meanRepresentation":
- clusterCenters = meanRepresentation(candidates, clusterOrder)
- elif representationMethod == "medoidRepresentation":
- clusterCenters, clusterCenterIndices = medoidRepresentation(
- candidates, clusterOrder
+ candidates: np.ndarray,
+ cluster_order: np.ndarray,
+ default: str,
+ representation_method: str | Distribution | MinMaxMean | None = None,
+ representation_dict: dict[str, str] | None = None,
+ distribution_period_wise: bool = True,
+ n_timesteps_per_period: int | None = None,
+) -> tuple[list[np.ndarray], list[int] | None]:
+ cluster_center_indices = None
+ if representation_method is None:
+ representation_method = default
+
+ # --- Dispatch on Representation objects first ---
+ if isinstance(representation_method, Distribution):
+ period_wise = representation_method.scope == "cluster"
+ cluster_centers = duration_representation(
+ candidates,
+ cluster_order,
+ period_wise,
+ n_timesteps_per_period,
+ represent_min_max=representation_method.preserve_minmax,
+ )
+ return cluster_centers, cluster_center_indices
+
+ if isinstance(representation_method, MinMaxMean):
+ cluster_centers = minmax_mean_representation(
+ candidates,
+ cluster_order,
+ representation_dict, # type: ignore[arg-type]
+ n_timesteps_per_period, # type: ignore[arg-type]
+ )
+ return cluster_centers, cluster_center_indices
+
+ # --- Fallback: string-based dispatch (legacy wrapper compat) ---
+ # Normalize old names to new names
+ representation_method = _ALIASES.get(representation_method, representation_method)
+ if representation_method == "mean":
+ cluster_centers = mean_representation(candidates, cluster_order)
+ elif representation_method == "medoid":
+ cluster_centers, cluster_center_indices = medoid_representation(
+ candidates, cluster_order
)
- elif representationMethod == "maxoidRepresentation":
- clusterCenters, clusterCenterIndices = maxoidRepresentation(
- candidates, clusterOrder
+ elif representation_method == "maxoid":
+ cluster_centers, cluster_center_indices = maxoid_representation(
+ candidates, cluster_order
)
- elif representationMethod == "minmaxmeanRepresentation":
- clusterCenters = minmaxmeanRepresentation(
- candidates, clusterOrder, representationDict, timeStepsPerPeriod
+ elif representation_method == "minmax_mean":
+ cluster_centers = minmax_mean_representation(
+ candidates,
+ cluster_order,
+ representation_dict, # type: ignore[arg-type]
+ n_timesteps_per_period, # type: ignore[arg-type]
)
- elif (
- representationMethod == "durationRepresentation"
- or representationMethod == "distributionRepresentation"
- ):
- clusterCenters = durationRepresentation(
+ elif representation_method == "distribution":
+ cluster_centers = duration_representation(
candidates,
- clusterOrder,
- distributionPeriodWise,
- timeStepsPerPeriod,
- representMinMax=False,
+ cluster_order,
+ distribution_period_wise,
+ n_timesteps_per_period,
+ represent_min_max=False,
)
- elif representationMethod == "distributionAndMinMaxRepresentation":
- clusterCenters = durationRepresentation(
+ elif representation_method == "distribution_minmax":
+ cluster_centers = duration_representation(
candidates,
- clusterOrder,
- distributionPeriodWise,
- timeStepsPerPeriod,
- representMinMax=True,
+ cluster_order,
+ distribution_period_wise,
+ n_timesteps_per_period,
+ represent_min_max=True,
)
else:
raise ValueError("Chosen 'representationMethod' does not exist.")
- return clusterCenters, clusterCenterIndices
+ return cluster_centers, cluster_center_indices
-def maxoidRepresentation(candidates, clusterOrder):
+def maxoid_representation(
+ candidates: np.ndarray,
+ cluster_order: np.ndarray,
+) -> tuple[list[np.ndarray], list[int]]:
"""
- Represents the candidates of a given cluster group (clusterOrder)
- by its medoid, measured with the euclidean distance.
-
- :param candidates: Dissimilarity matrix where each row represents a candidate. required
- :type candidates: np.ndarray
-
- :param clusterOrder: Integer array where the index refers to the candidate and the
- Integer entry to the group. required
- :type clusterOrder: np.array
+ Represents the candidates of a given cluster group (cluster_order)
+ by its maxoid, measured with the euclidean distance.
"""
# set cluster member that is farthest away from the points of the other clusters as maxoid
- clusterCenters = []
- clusterCenterIndices = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)
- innerDistMatrix = euclidean_distances(candidates, candidates[indice])
- mindistIdx = np.argmax(innerDistMatrix.sum(axis=0))
- clusterCenters.append(candidates[indice][mindistIdx])
- clusterCenterIndices.append(indice[0][mindistIdx])
-
- return clusterCenters, clusterCenterIndices
-
-
-def medoidRepresentation(candidates, clusterOrder):
+ cluster_centers = []
+ cluster_center_indices = []
+ for cluster_num in np.unique(cluster_order):
+ indices = np.where(cluster_order == cluster_num)
+ inner_dist_matrix = euclidean_distances(candidates, candidates[indices])
+ min_dist_idx = np.argmax(inner_dist_matrix.sum(axis=0))
+ cluster_centers.append(candidates[indices][min_dist_idx])
+ cluster_center_indices.append(indices[0][min_dist_idx])
+
+ return cluster_centers, cluster_center_indices
+
+
+def medoid_representation(
+ candidates: np.ndarray,
+ cluster_order: np.ndarray,
+) -> tuple[list[np.ndarray], list[int]]:
"""
- Represents the candidates of a given cluster group (clusterOrder)
+ Represents the candidates of a given cluster group (cluster_order)
by its medoid, measured with the euclidean distance.
-
- :param candidates: Dissimilarity matrix where each row represents a candidate. required
- :type candidates: np.ndarray
-
- :param clusterOrder: Integer array where the index refers to the candidate and the
- Integer entry to the group. required
- :type clusterOrder: np.array
"""
# set cluster center as medoid
- clusterCenters = []
- clusterCenterIndices = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)
- innerDistMatrix = euclidean_distances(candidates[indice])
- mindistIdx = np.argmin(innerDistMatrix.sum(axis=0))
- clusterCenters.append(candidates[indice][mindistIdx])
- clusterCenterIndices.append(indice[0][mindistIdx])
-
- return clusterCenters, clusterCenterIndices
-
-
-def meanRepresentation(candidates, clusterOrder):
+ cluster_centers = []
+ cluster_center_indices = []
+ for cluster_num in np.unique(cluster_order):
+ indices = np.where(cluster_order == cluster_num)
+ inner_dist_matrix = euclidean_distances(candidates[indices])
+ min_dist_idx = np.argmin(inner_dist_matrix.sum(axis=0))
+ cluster_centers.append(candidates[indices][min_dist_idx])
+ cluster_center_indices.append(indices[0][min_dist_idx])
+
+ return cluster_centers, cluster_center_indices
+
+
+def mean_representation(
+ candidates: np.ndarray,
+ cluster_order: np.ndarray,
+) -> list[np.ndarray]:
"""
- Represents the candidates of a given cluster group (clusterOrder)
+ Represents the candidates of a given cluster group (cluster_order)
by its mean.
-
- :param candidates: Dissimilarity matrix where each row represents a candidate. required
- :type candidates: np.ndarray
-
- :param clusterOrder: Integer array where the index refers to the candidate and the
- Integer entry to the group. required
- :type clusterOrder: np.array
"""
# set cluster centers as means of the group candidates
- clusterCenters = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)
- currentMean = candidates[indice].mean(axis=0)
- clusterCenters.append(currentMean)
- return clusterCenters
-
-
-def minmaxmeanRepresentation(
- candidates, clusterOrder, representationDict, timeStepsPerPeriod
-):
+ cluster_centers = []
+ for cluster_num in np.unique(cluster_order):
+ indices = np.where(cluster_order == cluster_num)
+ current_mean = candidates[indices].mean(axis=0)
+ cluster_centers.append(current_mean)
+ return cluster_centers
+
+
+def minmax_mean_representation(
+ candidates: np.ndarray,
+ cluster_order: np.ndarray,
+ representation_dict: dict[str, str],
+ n_timesteps_per_period: int,
+) -> list[np.ndarray]:
"""
- Represents the candidates of a given cluster group (clusterOrder)
+ Represents the candidates of a given cluster group (cluster_order)
by either the minimum, the maximum or the mean values of each time step for
all periods in that cluster depending on the command for each attribute.
-
- :param candidates: Dissimilarity matrix where each row represents a candidate. required
- :type candidates: np.ndarray
-
- :param clusterOrder: Integer array where the index refers to the candidate and the
- Integer entry to the group. required
- :type clusterOrder: np.array
-
- :param representationDict: A dictionary which defines for each attribute whether the typical
- period should be represented by the minimum or maximum values within each cluster.
- optional (default: None)
- :type representationDict: dictionary
-
- :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
- :type timeStepsPerPeriod: integer
"""
- # set cluster center depending of the representationDict
- clusterCenters = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)
- currentClusterCenter = np.zeros(len(representationDict) * timeStepsPerPeriod)
- for attributeNum in range(len(representationDict)):
- startIdx = attributeNum * timeStepsPerPeriod
- endIdx = (attributeNum + 1) * timeStepsPerPeriod
- if list(representationDict.values())[attributeNum] == "min":
- currentClusterCenter[startIdx:endIdx] = candidates[
- indice, startIdx:endIdx
+ cluster_centers = []
+ rep_values = list(representation_dict.values())
+ for cluster_num in np.unique(cluster_order):
+ indices = np.where(cluster_order == cluster_num)
+ current_cluster_center = np.zeros(
+ len(representation_dict) * n_timesteps_per_period
+ )
+ for attribute_num, rep in enumerate(rep_values):
+ start_idx = attribute_num * n_timesteps_per_period
+ end_idx = (attribute_num + 1) * n_timesteps_per_period
+ if rep == "min":
+ current_cluster_center[start_idx:end_idx] = candidates[
+ indices, start_idx:end_idx
].min(axis=1)
- elif list(representationDict.values())[attributeNum] == "max":
- currentClusterCenter[startIdx:endIdx] = candidates[
- indice, startIdx:endIdx
+ elif rep == "max":
+ current_cluster_center[start_idx:end_idx] = candidates[
+ indices, start_idx:end_idx
].max(axis=1)
- elif list(representationDict.values())[attributeNum] == "mean":
- currentClusterCenter[startIdx:endIdx] = candidates[
- indice, startIdx:endIdx
+ elif rep == "mean":
+ current_cluster_center[start_idx:end_idx] = candidates[
+ indices, start_idx:end_idx
].mean(axis=1)
else:
raise ValueError(
'At least one value in the representationDict is neither "min", "max" nor "mean".'
)
- clusterCenters.append(currentClusterCenter)
- return clusterCenters
+ cluster_centers.append(current_cluster_center)
+ return cluster_centers
diff --git a/src/tsam/result.py b/src/tsam/result.py
index d6218d86..fa960ca7 100644
--- a/src/tsam/result.py
+++ b/src/tsam/result.py
@@ -2,6 +2,7 @@
from __future__ import annotations
+import warnings
from dataclasses import dataclass, field
from functools import cached_property
from typing import TYPE_CHECKING, cast
@@ -12,7 +13,6 @@
if TYPE_CHECKING:
from tsam.config import ClusteringResult
from tsam.plot import ResultPlotAccessor
- from tsam.timeseriesaggregation import TimeSeriesAggregation
@dataclass
@@ -114,9 +114,10 @@ class AggregationResult:
Length equals the number of original periods.
Values are cluster indices (0 to n_clusters-1).
- cluster_weights : dict[int, int]
+ cluster_counts : dict[int, float]
How many original periods each cluster represents.
Keys are cluster indices, values are occurrence counts.
+ Values can be fractional due to partial-period adjustment.
n_clusters : int
Number of clusters (typical periods).
@@ -152,7 +153,7 @@ class AggregationResult:
1 0.15 0.42 0.82
...
- >>> result.cluster_weights
+ >>> result.cluster_counts
{0: 45, 1: 52, 2: 38, ...}
>>> result.accuracy.rmse
@@ -163,21 +164,63 @@ class AggregationResult:
"""
cluster_representatives: pd.DataFrame
- cluster_weights: dict[int, int]
+ cluster_counts: dict[int, float]
n_timesteps_per_period: int
segment_durations: tuple[tuple[int, ...], ...] | None
- accuracy: AccuracyMetrics
clustering_duration: float
clustering: ClusteringResult
is_transferred: bool
- _aggregation: TimeSeriesAggregation = field(repr=False, compare=False)
+ _original_data: pd.DataFrame = field(repr=False, compare=False)
+ _reconstructed_data: pd.DataFrame = field(repr=False, compare=False)
+ _time_index: pd.Index = field(repr=False, compare=False)
+ _accuracy_metrics: AccuracyMetrics | None = field(
+ default=None, repr=False, compare=False
+ )
+ _norm_values: pd.DataFrame | None = field(default=None, repr=False, compare=False)
+ _normalized_predicted: pd.DataFrame | None = field(
+ default=None, repr=False, compare=False
+ )
+ _rescale_deviations: pd.DataFrame = field(
+ default_factory=lambda: pd.DataFrame(
+ columns=["deviation_pct", "converged", "iterations"]
+ ),
+ repr=False,
+ compare=False,
+ )
+ _segmented_df: pd.DataFrame | None = field(default=None, repr=False, compare=False)
+ _weights: dict[str, float] | None = field(default=None, repr=False, compare=False)
+
+ @cached_property
+ def accuracy(self) -> AccuracyMetrics:
+ """Accuracy metrics comparing reconstructed to original data.
+
+ Computed lazily on first access.
+ """
+ if self._accuracy_metrics is not None:
+ return self._accuracy_metrics
+ from tsam.api import _weighted_mean, _weighted_rms
+ from tsam.pipeline.accuracy import compute_accuracy
+
+ assert self._norm_values is not None and self._normalized_predicted is not None
+ accuracy_df = compute_accuracy(self._norm_values, self._normalized_predicted)
+ return AccuracyMetrics(
+ rmse=accuracy_df["RMSE"],
+ mae=accuracy_df["MAE"],
+ rmse_duration=accuracy_df["RMSE_duration"],
+ rescale_deviations=self._rescale_deviations,
+ weighted_rmse=_weighted_rms(accuracy_df["RMSE"], self._weights),
+ weighted_mae=_weighted_mean(accuracy_df["MAE"], self._weights),
+ weighted_rmse_duration=_weighted_rms(
+ accuracy_df["RMSE_duration"], self._weights
+ ),
+ )
@cached_property
def n_clusters(self) -> int:
"""Number of clusters (typical periods).
Derived from the cluster_representatives DataFrame index,
- which is the authoritative source. Note: cluster_weights may
+ which is the authoritative source. Note: cluster_counts may
have more entries than actual cluster IDs due to tsam quirks.
"""
return self.cluster_representatives.index.get_level_values(0).nunique()
@@ -196,6 +239,16 @@ def cluster_assignments(self) -> np.ndarray:
"""
return np.array(self.clustering.cluster_assignments)
+ @property
+ def cluster_weights(self) -> dict[int, float]:
+ """Deprecated: use cluster_counts instead."""
+ warnings.warn(
+ "'cluster_weights' is deprecated, use 'cluster_counts'.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self.cluster_counts
+
def __repr__(self) -> str:
seg_info = f", n_segments={self.n_segments}" if self.n_segments else ""
transferred_info = ", is_transferred=True" if self.is_transferred else ""
@@ -222,14 +275,13 @@ def original(self) -> pd.DataFrame:
>>> result.original.shape == df.shape
True
"""
- return cast("pd.DataFrame", self._aggregation.timeSeries)
+ return self._original_data
@cached_property
def reconstructed(self) -> pd.DataFrame:
"""Reconstructed time series from typical periods.
Each original period is replaced by its assigned cluster representative.
- This is cached for performance since reconstruction can be expensive.
Returns
-------
@@ -242,7 +294,7 @@ def reconstructed(self) -> pd.DataFrame:
>>> result.reconstructed.shape == df.shape
True
"""
- return cast("pd.DataFrame", self._aggregation.predictOriginalData())
+ return self._reconstructed_data
def disaggregate(self, data: pd.DataFrame) -> pd.DataFrame:
"""Expand typical-period data back to the original time series length.
@@ -305,7 +357,7 @@ def to_dict(self) -> dict:
return {
"cluster_representatives": self.cluster_representatives.to_dict(),
"cluster_assignments": self.cluster_assignments.tolist(),
- "cluster_weights": self.cluster_weights,
+ "cluster_counts": self.cluster_counts,
"n_clusters": self.n_clusters,
"n_timesteps_per_period": self.n_timesteps_per_period,
"n_segments": self.n_segments,
@@ -350,7 +402,7 @@ def period_index(self) -> list[int]:
"""
return sorted(self.cluster_representatives.index.get_level_values(0).unique())
- @property
+ @cached_property
def assignments(self) -> pd.DataFrame:
"""Get timestep-level assignment information.
@@ -385,8 +437,6 @@ def assignments(self) -> pd.DataFrame:
>>> # Save and reload assignments
>>> result.assignments.to_csv("assignments.csv")
"""
- agg = self._aggregation
-
# Build period_idx and timestep_idx for each original timestep
period_indices = []
timestep_indices = []
@@ -404,21 +454,16 @@ def assignments(self) -> pd.DataFrame:
"timestep_idx": timestep_indices,
"cluster_idx": cluster_indices,
},
- index=agg.timeIndex,
+ index=self._time_index,
)
# Add segment_idx if segmentation was used
- if self.n_segments is not None and hasattr(
- agg, "segmentedNormalizedTypicalPeriods"
- ):
+ if self.n_segments is not None and self._segmented_df is not None:
segment_indices = []
for cluster_idx in self.cluster_assignments:
- # Get segment structure for this cluster's typical period
- segment_data = agg.segmentedNormalizedTypicalPeriods.loc[cluster_idx]
- # Segment Step is level 0, Segment Duration is level 1
+ segment_data = self._segmented_df.loc[cluster_idx]
segment_steps = segment_data.index.get_level_values(0)
segment_durations = segment_data.index.get_level_values(1)
- # Repeat each segment index by its duration
segment_indices.extend(
np.repeat(segment_steps, segment_durations).tolist()
)
@@ -426,7 +471,7 @@ def assignments(self) -> pd.DataFrame:
return result_df
- @property
+ @cached_property
def plot(self) -> ResultPlotAccessor:
"""Access plotting methods.
diff --git a/src/tsam/timeseriesaggregation.py b/src/tsam/timeseriesaggregation.py
index df4262d0..1540bda0 100644
--- a/src/tsam/timeseriesaggregation.py
+++ b/src/tsam/timeseriesaggregation.py
@@ -1,47 +1,41 @@
import copy
-import time
import warnings
import numpy as np
import pandas as pd
-from sklearn import preprocessing
-from sklearn.metrics import mean_absolute_error, mean_squared_error
+from tsam.config import (
+ ClusterConfig,
+ Distribution,
+ ExtremeConfig,
+ MinMaxMean,
+ SegmentConfig,
+)
from tsam.exceptions import LegacyAPIWarning
-from tsam.periodAggregation import aggregatePeriods
-from tsam.representations import representations
+from tsam.period_aggregation import aggregate_periods # noqa: F401 (re-exported)
+from tsam.pipeline import run_pipeline
+from tsam.pipeline.types import PipelineConfig, PredefParams
+from tsam.representations import representations # noqa: F401 (re-exported)
+from tsam.weights import validate_weights
pd.set_option("mode.chained_assignment", None)
-# max iterator while resacling cluster profiles
-MAX_ITERATOR = 20
-# tolerance while rescaling cluster periods to meet the annual sum of the original profile
-TOLERANCE = 1e-6
-
-
-# minimal weight that overwrites a weighting of zero in order to carry the profile through the aggregation process
-MIN_WEIGHT = 1e-6
-
-
-def unstackToPeriods(timeSeries, timeStepsPerPeriod):
+def unstack_to_periods(time_series, time_steps_per_period):
"""
    Extends the time series to an integer multiple of the period length and
    groups it into candidate periods.
- :param timeSeries:
- :type timeSeries: pandas DataFrame
+ :param time_series:
+ :type time_series: pandas DataFrame
- :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
- :type timeStepsPerPeriod: integer
+ :param time_steps_per_period: The number of discrete timesteps which describe one period. required
+ :type time_steps_per_period: integer
- :returns: - **unstackedTimeSeries** (pandas DataFrame) -- is stacked such that each row represents a
+    :returns: - **unstacked_time_series** (pandas DataFrame) -- reshaped so that each row represents a
candidate period
- - **timeIndex** (pandas Series index) -- is the modification of the original
+ - **time_index** (pandas Series index) -- is the modification of the original
        time series index in case an integer multiple was created
-
- .. deprecated::
- Use :func:`tsam.unstack_to_periods` instead.
"""
warnings.warn(
"unstackToPeriods will be removed in tsam v4.0. Use tsam.unstack_to_periods() instead.",
@@ -49,40 +43,102 @@ def unstackToPeriods(timeSeries, timeStepsPerPeriod):
stacklevel=2,
)
# init new grouped timeindex
- unstackedTimeSeries = timeSeries.copy()
+ unstacked_time_series = time_series.copy()
# initialize new indices
- periodIndex = []
- stepIndex = []
+ period_index = []
+ step_index = []
- # extend to inger multiple of period length
- if len(timeSeries) % timeStepsPerPeriod == 0:
+ # extend to integer multiple of period length
+ if len(time_series) % time_steps_per_period == 0:
attached_timesteps = 0
else:
# calculate number of timesteps which get attached
- attached_timesteps = timeStepsPerPeriod - len(timeSeries) % timeStepsPerPeriod
+ attached_timesteps = (
+ time_steps_per_period - len(time_series) % time_steps_per_period
+ )
# take these from the head of the original time series
- rep_data = unstackedTimeSeries.head(attached_timesteps)
+ rep_data = unstacked_time_series.head(attached_timesteps)
# append them at the end of the time series
- unstackedTimeSeries = pd.concat([unstackedTimeSeries, rep_data])
+ unstacked_time_series = pd.concat([unstacked_time_series, rep_data])
# create period and step index
- for ii in range(0, len(unstackedTimeSeries)):
- periodIndex.append(int(ii / timeStepsPerPeriod))
- stepIndex.append(ii - int(ii / timeStepsPerPeriod) * timeStepsPerPeriod)
+ for ii in range(0, len(unstacked_time_series)):
+ period_index.append(int(ii / time_steps_per_period))
+ step_index.append(ii - int(ii / time_steps_per_period) * time_steps_per_period)
# save old index
- timeIndex = copy.deepcopy(unstackedTimeSeries.index)
+ time_index = copy.deepcopy(unstacked_time_series.index)
# create new double index and unstack the time series
- unstackedTimeSeries.index = pd.MultiIndex.from_arrays(
- [stepIndex, periodIndex], names=["TimeStep", "PeriodNum"]
+ unstacked_time_series.index = pd.MultiIndex.from_arrays(
+ [step_index, period_index], names=["TimeStep", "PeriodNum"]
)
- unstackedTimeSeries = unstackedTimeSeries.unstack(level="TimeStep")
-
- return unstackedTimeSeries, timeIndex
+ unstacked_time_series = unstacked_time_series.unstack(level="TimeStep")
+
+ return unstacked_time_series, time_index
+
+
+# Legacy alias
+unstackToPeriods = unstack_to_periods
+
+
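A minimal sketch of what `unstack_to_periods` returns, assuming an hourly two-day input (note that the deprecation warning above still fires, since the legacy alias shares the same implementation):

```python
# Illustrative only: reshaping a 48-step hourly series into
# 2 candidate periods of 24 steps each.
import numpy as np
import pandas as pd

from tsam.timeseriesaggregation import unstack_to_periods

index = pd.date_range("2030-01-01", periods=48, freq="h")
series = pd.DataFrame({"load": np.arange(48.0)}, index=index)

unstacked, time_index = unstack_to_periods(series, time_steps_per_period=24)
print(unstacked.shape)           # (2, 24): one row per candidate period
print(time_index.equals(index))  # True: 48 % 24 == 0, so nothing was padded
```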
+_PARAM_ALIASES = {
+ "timeSeries": "time_series",
+ "noTypicalPeriods": "no_typical_periods",
+ "noSegments": "no_segments",
+ "hoursPerPeriod": "hours_per_period",
+ "clusterMethod": "cluster_method",
+ "evalSumPeriods": "eval_sum_periods",
+ "sortValues": "sort_values",
+ "sameMean": "same_mean",
+ "rescaleClusterPeriods": "rescale_cluster_periods",
+ "rescaleExcludeColumns": "rescale_exclude_columns",
+ "weightDict": "weight_dict",
+ "extremePeriodMethod": "extreme_period_method",
+ "representationMethod": "representation_method",
+ "representationDict": "representation_dict",
+ "distributionPeriodWise": "distribution_period_wise",
+ "segmentRepresentationMethod": "segment_representation_method",
+ "predefClusterOrder": "predef_cluster_order",
+ "predefClusterCenterIndices": "predef_cluster_center_indices",
+ "predefExtremeClusterIdx": "predef_extreme_cluster_idx",
+ "predefSegmentOrder": "predef_segment_order",
+ "predefSegmentDurations": "predef_segment_durations",
+ "predefSegmentCenters": "predef_segment_centers",
+ "numericalTolerance": "numerical_tolerance",
+ "roundOutput": "round_output",
+ "addPeakMin": "add_peak_min",
+ "addPeakMax": "add_peak_max",
+ "addMeanMin": "add_mean_min",
+ "addMeanMax": "add_mean_max",
+}
+
+
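A hedged sketch of how this alias table behaves for callers of the legacy constructor; `df` is a hypothetical hourly DataFrame with a DatetimeIndex:

```python
# Illustrative only: deprecated camelCase kwargs are translated to
# snake_case with a FutureWarning (`df` is hypothetical input data).
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # silence FutureWarning + LegacyAPIWarning
    agg = TimeSeriesAggregation(df, noTypicalPeriods=8)  # deprecated spelling

assert agg.no_typical_periods == 8

# Mixing both spellings fails fast:
#   TimeSeriesAggregation(df, noTypicalPeriods=8, no_typical_periods=8)
#   -> TypeError: Cannot specify both 'noTypicalPeriods' and 'no_typical_periods'
```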
+# Translation maps from old API names to new API names
+_CLUSTER_METHOD_MAP = {
+ "k_means": "kmeans",
+ "k_medoids": "kmedoids",
+ "k_maxoids": "kmaxoids",
+ "adjacent_periods": "contiguous",
+ "averaging": "averaging",
+ "hierarchical": "hierarchical",
+}
+
+_REPR_METHOD_MAP = {
+ "meanRepresentation": "mean",
+ "medoidRepresentation": "medoid",
+ "maxoidRepresentation": "maxoid",
+}
+
+_EXTREME_METHOD_MAP = {
+ "None": None,
+ "append": "append",
+ "new_cluster_center": "new_cluster",
+ "replace_cluster_center": "replace",
+}
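The three maps give a direct old-to-new dictionary lookup; spot-checking a few entries:

```python
# Illustrative only: spot-checking the translation tables above.
assert _CLUSTER_METHOD_MAP["k_means"] == "kmeans"
assert _REPR_METHOD_MAP["medoidRepresentation"] == "medoid"
assert _EXTREME_METHOD_MAP["new_cluster_center"] == "new_cluster"
assert _EXTREME_METHOD_MAP["None"] is None  # the *string* 'None' disables extremes
```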
class TimeSeriesAggregation:
@@ -118,309 +174,317 @@ class TimeSeriesAggregation:
def __init__(
self,
- timeSeries,
+ time_series=None,
resolution=None,
- noTypicalPeriods=10,
- noSegments=10,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- evalSumPeriods=False,
- sortValues=False,
- sameMean=False,
- rescaleClusterPeriods=True,
- rescaleExcludeColumns=None,
- weightDict=None,
+ no_typical_periods=10,
+ no_segments=10,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ eval_sum_periods=False,
+ sort_values=False,
+ same_mean=False,
+ rescale_cluster_periods=True,
+ rescale_exclude_columns=None,
+ weight_dict=None,
segmentation=False,
- extremePeriodMethod="None",
- representationMethod=None,
- representationDict=None,
- distributionPeriodWise=True,
- segmentRepresentationMethod=None,
- predefClusterOrder=None,
- predefClusterCenterIndices=None,
- predefExtremeClusterIdx=None,
- predefSegmentOrder=None,
- predefSegmentDurations=None,
- predefSegmentCenters=None,
+ extreme_period_method="None",
+ representation_method=None,
+ representation_dict=None,
+ distribution_period_wise=True,
+ segment_representation_method=None,
+ predef_cluster_order=None,
+ predef_cluster_center_indices=None,
+ predef_extreme_cluster_idx=None,
+ predef_segment_order=None,
+ predef_segment_durations=None,
+ predef_segment_centers=None,
solver="highs",
- numericalTolerance=1e-13,
- roundOutput=None,
- addPeakMin=None,
- addPeakMax=None,
- addMeanMin=None,
- addMeanMax=None,
+ numerical_tolerance=1e-13,
+ round_output=None,
+ add_peak_min=None,
+ add_peak_max=None,
+ add_mean_min=None,
+ add_mean_max=None,
+ **kwargs,
):
"""
Initialize the periodly clusters.
- :param timeSeries: DataFrame with the datetime as index and the relevant
+ :param time_series: DataFrame with the datetime as index and the relevant
time series parameters as columns. required
- :type timeSeries: pandas.DataFrame() or dict
+ :type time_series: pandas.DataFrame() or dict
- :param resolution: Resolution of the time series in hours [h]. If timeSeries is a
+ :param resolution: Resolution of the time series in hours [h]. If time_series is a
pandas.DataFrame() the resolution is derived from the datetime
- index. optional, default: delta_T in timeSeries
+ index. optional, default: delta_T in time_series
:type resolution: float
- :param hoursPerPeriod: Value which defines the length of a cluster period. optional, default: 24
- :type hoursPerPeriod: integer
+ :param hours_per_period: Value which defines the length of a cluster period. optional, default: 24
+ :type hours_per_period: integer
- :param noTypicalPeriods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
- :type noTypicalPeriods: integer
+ :param no_typical_periods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
+ :type no_typical_periods: integer
- :param noSegments: Number of segments in which the typical periods shoul be subdivided - equivalent to the
+ :param no_segments: Number of segments in which the typical periods should be subdivided - equivalent to the
number of inner-period clusters. optional, default: 10
- :type noSegments: integer
-
- :param clusterMethod: Chosen clustering method. optional, default: 'hierarchical'
- |br| Options are:
+ :type no_segments: integer
- * 'averaging'
- * 'k_means'
- * 'k_medoids'
- * 'k_maxoids'
- * 'hierarchical'
- * 'adjacent_periods'
- :type clusterMethod: string
+ :param cluster_method: Chosen clustering method. optional, default: 'hierarchical'
+ :type cluster_method: string
- :param evalSumPeriods: Boolean if in the clustering process also the averaged periodly values
+    :param eval_sum_periods: Boolean if in the clustering process also the averaged periodly values
            shall be integrated as additional parameters alongside the periodly profiles. optional, default: False
- :type evalSumPeriods: boolean
+ :type eval_sum_periods: boolean
- :param sameMean: Boolean which is used in the normalization procedure. If true, all time series get normalized
+ :param same_mean: Boolean which is used in the normalization procedure. If true, all time series get normalized
such that they have the same mean value. optional, default: False
- :type sameMean: boolean
+ :type same_mean: boolean
- :param sortValues: Boolean if the clustering should be done by the periodly duration
+ :param sort_values: Boolean if the clustering should be done by the periodly duration
curves (true) or the original shape of the data. optional (default: False)
- :type sortValues: boolean
+ :type sort_values: boolean
- :param rescaleClusterPeriods: Decides if the cluster Periods shall get rescaled such that their
+ :param rescale_cluster_periods: Decides if the cluster Periods shall get rescaled such that their
weighted mean value fits the mean value of the original time series. optional (default: True)
- :type rescaleClusterPeriods: boolean
-
- :param weightDict: Dictionary which weights the profiles. It is done by scaling
- the time series while the normalization process. Normally all time
- series have a scale from 0 to 1. By scaling them, the values get
- different distances to each other and with this, they are
- differently evaluated while the clustering process. optional (default: None )
- :type weightDict: dict
-
- :param segmentation: Boolean if time steps in periods should be aggregated to segments. optional (default: False)
- :type segmentation: boolean
-
- :param extremePeriodMethod: Method how to integrate extreme Periods (peak demand, lowest temperature etc.)
- into to the typical period profiles. optional, default: 'None'
- |br| Options are:
-
- * None: No integration at all.
- * 'append': append typical Periods to cluster centers
- * 'new_cluster_center': add the extreme period as additional cluster center. It is checked then for all
- Periods if they fit better to the this new center or their original cluster center.
- * 'replace_cluster_center': replaces the cluster center of the
- cluster where the extreme period belongs to with the periodly profile of the extreme period. (Worst
- case system design)
- :type extremePeriodMethod: string
-
- :param representationMethod: Chosen representation. If specified, the clusters are represented in the chosen
- way. Otherwise, each clusterMethod has its own commonly used default representation method.
- |br| Options are:
-
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
- * 'minmaxmeanRepresentation'
- * 'durationRepresentation'/ 'distributionRepresentation'
- * 'distribtionAndMinMaxRepresentation'
- :type representationMethod: string
-
- :param representationDict: Dictionary which states for each attribute whether the profiles in each cluster
- should be represented by the minimum value or maximum value of each time step. This enables estimations
- to the safe side. This dictionary is needed when 'minmaxmeanRepresentation' is chosen. If not specified, the
- dictionary is set to containing 'mean' values only.
- :type representationDict: dict
-
- :param distributionPeriodWise: If durationRepresentation is chosen, you can choose whether the distribution of
- each cluster should be separately preserved or that of the original time series only (default: True)
- :type distributionPeriodWise:
+ :type rescale_cluster_periods: boolean
- :param segmentRepresentationMethod: Chosen representation for the segments. If specified, the segments are
- represented in the chosen way. Otherwise, it is inherited from the representationMethod.
- |br| Options are:
+ :param weight_dict: Dictionary which weights the profiles. optional (default: None)
+ :type weight_dict: dict
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
- * 'minmaxmeanRepresentation'
- * 'durationRepresentation'/ 'distributionRepresentation'
- * 'distribtionAndMinMaxRepresentation'
- :type segmentRepresentationMethod: string
+    :param extreme_period_method: Method for integrating extreme Periods. optional, default: 'None'
+ :type extreme_period_method: string
- :param predefClusterOrder: Instead of aggregating a time series, a predefined grouping is taken
- which is given by this list. optional (default: None)
- :type predefClusterOrder: list or array
+ :param representation_method: Chosen representation. optional
+ :type representation_method: string
- :param predefClusterCenterIndices: If predefClusterOrder is give, this list can define the representative
- cluster candidates. Otherwise the medoid is taken. optional (default: None)
- :type predefClusterCenterIndices: list or array
+ :param representation_dict: Dictionary which states for each attribute whether the profiles in each cluster
+ should be represented by the minimum value or maximum value of each time step.
+ :type representation_dict: dict
- :param solver: Solver that is used for k_medoids clustering. optional (default: 'cbc' )
- :type solver: string
+ :param distribution_period_wise: If duration representation is chosen, you can choose whether the distribution of
+ each cluster should be separately preserved or that of the original time series only (default: True)
+ :type distribution_period_wise: boolean
- :param numericalTolerance: Tolerance for numerical issues. Silences the warning for exceeding upper or lower bounds
- of the time series. optional (default: 1e-13 )
- :type numericalTolerance: float
+ :param numerical_tolerance: Tolerance for numerical issues. optional (default: 1e-13)
+ :type numerical_tolerance: float
- :param roundOutput: Decimals to what the output time series get round. optional (default: None )
- :type roundOutput: integer
+    :param round_output: Number of decimals to which the output time series is rounded. optional (default: None)
+ :type round_output: integer
- :param addPeakMin: List of column names which's minimal value shall be added to the
- typical periods. E.g.: ['Temperature']. optional, default: []
- :type addPeakMin: list
+    :param add_peak_min: List of column names whose minimal value shall be added. optional, default: []
+ :type add_peak_min: list
- :param addPeakMax: List of column names which's maximal value shall be added to the
- typical periods. E.g. ['EDemand', 'HDemand']. optional, default: []
- :type addPeakMax: list
+    :param add_peak_max: List of column names whose maximal value shall be added. optional, default: []
+ :type add_peak_max: list
- :param addMeanMin: List of column names where the period with the cumulative minimal value
- shall be added to the typical periods. E.g. ['Photovoltaic']. optional, default: []
- :type addMeanMin: list
+ :param add_mean_min: List of column names where the period with the cumulative minimal value
+ shall be added. optional, default: []
+ :type add_mean_min: list
- :param addMeanMax: List of column names where the period with the cumulative maximal value
- shall be added to the typical periods. optional, default: []
- :type addMeanMax: list
+ :param add_mean_max: List of column names where the period with the cumulative maximal value
+ shall be added. optional, default: []
+ :type add_mean_max: list
"""
+ # Translate deprecated camelCase kwargs to snake_case
+ for old_name, new_name in _PARAM_ALIASES.items():
+ if old_name in kwargs:
+ warnings.warn(
+ f"'{old_name}' is deprecated, use '{new_name}'.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ if new_name in kwargs:
+ raise TypeError(
+ f"Cannot specify both '{old_name}' and '{new_name}'"
+ )
+ kwargs[new_name] = kwargs.pop(old_name)
+
+ # Apply translated kwargs as overrides
+ time_series = kwargs.pop("time_series", time_series)
+ resolution = kwargs.pop("resolution", resolution)
+ no_typical_periods = kwargs.pop("no_typical_periods", no_typical_periods)
+ no_segments = kwargs.pop("no_segments", no_segments)
+ hours_per_period = kwargs.pop("hours_per_period", hours_per_period)
+ cluster_method = kwargs.pop("cluster_method", cluster_method)
+ eval_sum_periods = kwargs.pop("eval_sum_periods", eval_sum_periods)
+ sort_values = kwargs.pop("sort_values", sort_values)
+ same_mean = kwargs.pop("same_mean", same_mean)
+ rescale_cluster_periods = kwargs.pop(
+ "rescale_cluster_periods", rescale_cluster_periods
+ )
+ rescale_exclude_columns = kwargs.pop(
+ "rescale_exclude_columns", rescale_exclude_columns
+ )
+ weight_dict = kwargs.pop("weight_dict", weight_dict)
+ segmentation = kwargs.pop("segmentation", segmentation)
+ extreme_period_method = kwargs.pop(
+ "extreme_period_method", extreme_period_method
+ )
+ representation_method = kwargs.pop(
+ "representation_method", representation_method
+ )
+ representation_dict = kwargs.pop("representation_dict", representation_dict)
+ distribution_period_wise = kwargs.pop(
+ "distribution_period_wise", distribution_period_wise
+ )
+ segment_representation_method = kwargs.pop(
+ "segment_representation_method", segment_representation_method
+ )
+ predef_cluster_order = kwargs.pop("predef_cluster_order", predef_cluster_order)
+ predef_cluster_center_indices = kwargs.pop(
+ "predef_cluster_center_indices", predef_cluster_center_indices
+ )
+ predef_extreme_cluster_idx = kwargs.pop(
+ "predef_extreme_cluster_idx", predef_extreme_cluster_idx
+ )
+ predef_segment_order = kwargs.pop("predef_segment_order", predef_segment_order)
+ predef_segment_durations = kwargs.pop(
+ "predef_segment_durations", predef_segment_durations
+ )
+ predef_segment_centers = kwargs.pop(
+ "predef_segment_centers", predef_segment_centers
+ )
+ solver = kwargs.pop("solver", solver)
+ numerical_tolerance = kwargs.pop("numerical_tolerance", numerical_tolerance)
+ round_output = kwargs.pop("round_output", round_output)
+ add_peak_min = kwargs.pop("add_peak_min", add_peak_min)
+ add_peak_max = kwargs.pop("add_peak_max", add_peak_max)
+ add_mean_min = kwargs.pop("add_mean_min", add_mean_min)
+ add_mean_max = kwargs.pop("add_mean_max", add_mean_max)
+
+ if kwargs:
+ raise TypeError(f"Unexpected keyword arguments: {set(kwargs)}")
+
warnings.warn(
"TimeSeriesAggregation will be removed in tsam v4.0. "
"Use tsam.aggregate() instead. See the migration guide in the documentation.",
LegacyAPIWarning,
stacklevel=2,
)
- if addMeanMin is None:
- addMeanMin = []
- if addMeanMax is None:
- addMeanMax = []
- if addPeakMax is None:
- addPeakMax = []
- if addPeakMin is None:
- addPeakMin = []
- if weightDict is None:
- weightDict = {}
- self.timeSeries = timeSeries
+ if add_mean_min is None:
+ add_mean_min = []
+ if add_mean_max is None:
+ add_mean_max = []
+ if add_peak_max is None:
+ add_peak_max = []
+ if add_peak_min is None:
+ add_peak_min = []
+ if weight_dict is None:
+ weight_dict = {}
+ self.time_series = time_series
self.resolution = resolution
- self.hoursPerPeriod = hoursPerPeriod
+ self.hours_per_period = hours_per_period
- self.noTypicalPeriods = noTypicalPeriods
+ self.no_typical_periods = no_typical_periods
- self.noSegments = noSegments
+ self.no_segments = no_segments
- self.clusterMethod = clusterMethod
+ self.cluster_method = cluster_method
- self.extremePeriodMethod = extremePeriodMethod
+ self.extreme_period_method = extreme_period_method
- self.evalSumPeriods = evalSumPeriods
+ self.eval_sum_periods = eval_sum_periods
- self.sortValues = sortValues
+ self.sort_values = sort_values
- self.sameMean = sameMean
+ self.same_mean = same_mean
- self.rescaleClusterPeriods = rescaleClusterPeriods
+ self.rescale_cluster_periods = rescale_cluster_periods
- self.rescaleExcludeColumns = rescaleExcludeColumns or []
+ self.rescale_exclude_columns = rescale_exclude_columns or []
- self.weightDict = weightDict
+ self.weight_dict = weight_dict
- self.representationMethod = representationMethod
+ self.representation_method = representation_method
- self.representationDict = representationDict
+ self.representation_dict = representation_dict
- self.distributionPeriodWise = distributionPeriodWise
+ self.distribution_period_wise = distribution_period_wise
- self.segmentRepresentationMethod = segmentRepresentationMethod
+ self.segment_representation_method = segment_representation_method
- self.predefClusterOrder = predefClusterOrder
+ self.predef_cluster_order = predef_cluster_order
- self.predefClusterCenterIndices = predefClusterCenterIndices
+ self.predef_cluster_center_indices = predef_cluster_center_indices
- self.predefExtremeClusterIdx = predefExtremeClusterIdx
+ self.predef_extreme_cluster_idx = predef_extreme_cluster_idx
- self.predefSegmentOrder = predefSegmentOrder
+ self.predef_segment_order = predef_segment_order
- self.predefSegmentDurations = predefSegmentDurations
+ self.predef_segment_durations = predef_segment_durations
- self.predefSegmentCenters = predefSegmentCenters
+ self.predef_segment_centers = predef_segment_centers
self.solver = solver
- self.numericalTolerance = numericalTolerance
+ self.numerical_tolerance = numerical_tolerance
self.segmentation = segmentation
- self.roundOutput = roundOutput
+ self.round_output = round_output
- self.addPeakMin = addPeakMin
+ self.add_peak_min = add_peak_min
- self.addPeakMax = addPeakMax
+ self.add_peak_max = add_peak_max
- self.addMeanMin = addMeanMin
+ self.add_mean_min = add_mean_min
- self.addMeanMax = addMeanMax
+ self.add_mean_max = add_mean_max
self._check_init_args()
- # internal attributes
- self._normalizedMean = None
-
return
def _check_init_args(self):
- # check timeSeries and set it as pandas DataFrame
- if not isinstance(self.timeSeries, pd.DataFrame):
- if isinstance(self.timeSeries, dict) or isinstance(
- self.timeSeries, np.ndarray
+ # check time_series and set it as pandas DataFrame
+ if not isinstance(self.time_series, pd.DataFrame):
+ if isinstance(self.time_series, dict) or isinstance(
+ self.time_series, np.ndarray
):
- self.timeSeries = pd.DataFrame(self.timeSeries)
+ self.time_series = pd.DataFrame(self.time_series)
else:
raise ValueError(
- "timeSeries has to be of type pandas.DataFrame() "
+ "time_series has to be of type pandas.DataFrame() "
+ "or of type np.array() "
"in initialization of object of class " + type(self).__name__
)
# check if extreme periods exist in the dataframe
- for peak in self.addPeakMin:
- if peak not in self.timeSeries.columns:
+ for peak in self.add_peak_min:
+ if peak not in self.time_series.columns:
raise ValueError(
peak
- + ' listed in "addPeakMin"'
- + " does not occur as timeSeries column"
+ + ' listed in "add_peak_min"'
+ + " does not occur as time_series column"
)
- for peak in self.addPeakMax:
- if peak not in self.timeSeries.columns:
+ for peak in self.add_peak_max:
+ if peak not in self.time_series.columns:
raise ValueError(
peak
- + ' listed in "addPeakMax"'
- + " does not occur as timeSeries column"
+ + ' listed in "add_peak_max"'
+ + " does not occur as time_series column"
)
- for peak in self.addMeanMin:
- if peak not in self.timeSeries.columns:
+ for peak in self.add_mean_min:
+ if peak not in self.time_series.columns:
raise ValueError(
peak
- + ' listed in "addMeanMin"'
- + " does not occur as timeSeries column"
+ + ' listed in "add_mean_min"'
+ + " does not occur as time_series column"
)
- for peak in self.addMeanMax:
- if peak not in self.timeSeries.columns:
+ for peak in self.add_mean_max:
+ if peak not in self.time_series.columns:
raise ValueError(
peak
- + ' listed in "addMeanMax"'
- + " does not occur as timeSeries column"
+ + ' listed in "add_mean_max"'
+ + " does not occur as time_series column"
)
# derive resolution from date time index if not provided
if self.resolution is None:
try:
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
+ timedelta = self.time_series.index[1] - self.time_series.index[0]
self.resolution = float(timedelta.total_seconds()) / 3600
except AttributeError as exc:
raise ValueError(
@@ -429,8 +493,8 @@ def _check_init_args(self):
) from exc
except TypeError:
try:
- self.timeSeries.index = pd.to_datetime(self.timeSeries.index)
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
+ self.time_series.index = pd.to_datetime(self.time_series.index)
+ timedelta = self.time_series.index[1] - self.time_series.index[0]
self.resolution = float(timedelta.total_seconds()) / 3600
except Exception as exc:
raise ValueError(
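The fallback derives the resolution in hours from the first index step; a minimal sketch of that arithmetic:

```python
# Illustrative only: how the resolution is derived from a DatetimeIndex.
import pandas as pd

index = pd.date_range("2030-01-01", periods=3, freq="15min")
timedelta = index[1] - index[0]
print(float(timedelta.total_seconds()) / 3600)  # 0.25 hours per time step
```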
@@ -441,1029 +505,474 @@ def _check_init_args(self):
if not (isinstance(self.resolution, int) or isinstance(self.resolution, float)):
raise ValueError("resolution has to be nonnegative float or int")
- # check hoursPerPeriod
- if self.hoursPerPeriod is None or self.hoursPerPeriod <= 0:
- raise ValueError("hoursPerPeriod has to be nonnegative float or int")
+ # check hours_per_period
+ if self.hours_per_period is None or self.hours_per_period <= 0:
+ raise ValueError("hours_per_period has to be nonnegative float or int")
# check typical Periods
if (
- self.noTypicalPeriods is None
- or self.noTypicalPeriods <= 0
- or not isinstance(self.noTypicalPeriods, int)
+ self.no_typical_periods is None
+ or self.no_typical_periods <= 0
+ or not isinstance(self.no_typical_periods, int)
):
- raise ValueError("noTypicalPeriods has to be nonnegative integer")
- self.timeStepsPerPeriod = int(self.hoursPerPeriod / self.resolution)
- if not self.timeStepsPerPeriod == self.hoursPerPeriod / self.resolution:
+ raise ValueError("no_typical_periods has to be nonnegative integer")
+ self.time_steps_per_period = int(self.hours_per_period / self.resolution)
+ if not self.time_steps_per_period == self.hours_per_period / self.resolution:
raise ValueError(
- "The combination of hoursPerPeriod and the "
- + "resulution does not result in an integer "
+ "The combination of hours_per_period and the "
+ + "resolution does not result in an integer "
+ "number of time steps per period"
)
if self.segmentation:
- if self.noSegments > self.timeStepsPerPeriod:
+ if self.no_segments > self.time_steps_per_period:
warnings.warn(
"The number of segments must be less than or equal to the number of time steps per period. "
"Segment number is decreased to number of time steps per period."
)
- self.noSegments = self.timeStepsPerPeriod
+ self.no_segments = self.time_steps_per_period
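Both derived-size rules above can be checked with plain arithmetic; a sketch:

```python
# Illustrative only: the derived-size checks performed above.
hours_per_period, resolution = 24, 0.25
time_steps_per_period = int(hours_per_period / resolution)
assert time_steps_per_period == hours_per_period / resolution  # 96 steps: OK

# 24 h at a 5 h resolution would give 4.8 steps and raise ValueError;
# segmentation with no_segments=30 but only 24 steps warns and clamps to 24.
```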
- # check clusterMethod
- if self.clusterMethod not in self.CLUSTER_METHODS:
+ # check cluster_method
+ if self.cluster_method not in self.CLUSTER_METHODS:
raise ValueError(
- "clusterMethod needs to be one of "
+ "cluster_method needs to be one of "
+ "the following: "
+ f"{self.CLUSTER_METHODS}"
)
- # check representationMethod
+ # check representation_method
if (
- self.representationMethod is not None
- and self.representationMethod not in self.REPRESENTATION_METHODS
+ self.representation_method is not None
+ and self.representation_method not in self.REPRESENTATION_METHODS
):
raise ValueError(
- "If specified, representationMethod needs to be one of "
+ "If specified, representation_method needs to be one of "
+ "the following: "
+ f"{self.REPRESENTATION_METHODS}"
)
- # check representationMethod
- if self.segmentRepresentationMethod is None:
- self.segmentRepresentationMethod = self.representationMethod
+ # check segment_representation_method
+ if self.segment_representation_method is None:
+ self.segment_representation_method = self.representation_method
else:
- if self.segmentRepresentationMethod not in self.REPRESENTATION_METHODS:
+ if self.segment_representation_method not in self.REPRESENTATION_METHODS:
raise ValueError(
- "If specified, segmentRepresentationMethod needs to be one of "
+ "If specified, segment_representation_method needs to be one of "
+ "the following: "
+ f"{self.REPRESENTATION_METHODS}"
)
- # if representationDict None, represent by maximum time steps in each cluster
- if self.representationDict is None:
- self.representationDict = dict.fromkeys(
- list(self.timeSeries.columns), "mean"
+        # if representation_dict is None, represent each time step by its mean value within each cluster
+ if self.representation_dict is None:
+ self.representation_dict = dict.fromkeys(
+ list(self.time_series.columns), "mean"
)
- # sort representationDict alphabetically to make sure that the min, max or mean function is applied to the right
+ # sort representation_dict alphabetically to make sure that the min, max or mean function is applied to the right
# column
- self.representationDict = (
- pd.Series(self.representationDict).sort_index(axis=0).to_dict()
+ self.representation_dict = (
+ pd.Series(self.representation_dict).sort_index(axis=0).to_dict()
)
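The alphabetical re-ordering matters because the min, max, or mean functions are applied positionally to the sorted columns; a sketch of the round trip:

```python
# Illustrative only: defaulting and alphabetically re-sorting the dict.
import pandas as pd

representation_dict = {"Wind": "max", "Load": "min"}
representation_dict = pd.Series(representation_dict).sort_index(axis=0).to_dict()
print(representation_dict)  # {'Load': 'min', 'Wind': 'max'} -- matches sorted columns
```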
- # check extremePeriods
- if self.extremePeriodMethod not in self.EXTREME_PERIOD_METHODS:
+ # check extreme_periods
+ if self.extreme_period_method not in self.EXTREME_PERIOD_METHODS:
raise ValueError(
- "extremePeriodMethod needs to be one of "
+ "extreme_period_method needs to be one of "
+ "the following: "
+ f"{self.EXTREME_PERIOD_METHODS}"
)
- # check evalSumPeriods
- if not isinstance(self.evalSumPeriods, bool):
- raise ValueError("evalSumPeriods has to be boolean")
- # check sortValues
- if not isinstance(self.sortValues, bool):
- raise ValueError("sortValues has to be boolean")
- # check sameMean
- if not isinstance(self.sameMean, bool):
- raise ValueError("sameMean has to be boolean")
- # check rescaleClusterPeriods
- if not isinstance(self.rescaleClusterPeriods, bool):
- raise ValueError("rescaleClusterPeriods has to be boolean")
-
- # check predefClusterOrder
- if self.predefClusterOrder is not None:
- if not isinstance(self.predefClusterOrder, (list, np.ndarray)):
- raise ValueError("predefClusterOrder has to be an array or list")
- if self.predefClusterCenterIndices is not None:
- # check predefClusterCenterIndices
- if not isinstance(self.predefClusterCenterIndices, (list, np.ndarray)):
+ # check eval_sum_periods
+ if not isinstance(self.eval_sum_periods, bool):
+ raise ValueError("eval_sum_periods has to be boolean")
+ # check sort_values
+ if not isinstance(self.sort_values, bool):
+ raise ValueError("sort_values has to be boolean")
+ # check same_mean
+ if not isinstance(self.same_mean, bool):
+ raise ValueError("same_mean has to be boolean")
+ # check rescale_cluster_periods
+ if not isinstance(self.rescale_cluster_periods, bool):
+ raise ValueError("rescale_cluster_periods has to be boolean")
+
+ # check predef_cluster_order
+ if self.predef_cluster_order is not None:
+ if not isinstance(self.predef_cluster_order, (list, np.ndarray)):
+ raise ValueError("predef_cluster_order has to be an array or list")
+ if self.predef_cluster_center_indices is not None:
+ # check predef_cluster_center_indices
+ if not isinstance(
+ self.predef_cluster_center_indices, (list, np.ndarray)
+ ):
raise ValueError(
- "predefClusterCenterIndices has to be an array or list"
+ "predef_cluster_center_indices has to be an array or list"
)
- elif self.predefClusterCenterIndices is not None:
+ elif self.predef_cluster_center_indices is not None:
raise ValueError(
- 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be defined as well'
+ 'If "predef_cluster_center_indices" is defined, "predef_cluster_order" needs to be defined as well'
)
- # check predefSegmentOrder
- if self.predefSegmentOrder is not None:
- if not isinstance(self.predefSegmentOrder, (list, tuple)):
- raise ValueError("predefSegmentOrder has to be a list or tuple")
- if self.predefSegmentDurations is None:
+ # check predef_segment_order
+ if self.predef_segment_order is not None:
+ if not isinstance(self.predef_segment_order, (list, tuple)):
+ raise ValueError("predef_segment_order has to be a list or tuple")
+ if self.predef_segment_durations is None:
raise ValueError(
- 'If "predefSegmentOrder" is defined, "predefSegmentDurations" '
+ 'If "predef_segment_order" is defined, "predef_segment_durations" '
"needs to be defined as well"
)
- if not isinstance(self.predefSegmentDurations, (list, tuple)):
- raise ValueError("predefSegmentDurations has to be a list or tuple")
- elif self.predefSegmentDurations is not None:
+ if not isinstance(self.predef_segment_durations, (list, tuple)):
+ raise ValueError("predef_segment_durations has to be a list or tuple")
+ elif self.predef_segment_durations is not None:
raise ValueError(
- 'If "predefSegmentDurations" is defined, "predefSegmentOrder" '
+ 'If "predef_segment_durations" is defined, "predef_segment_order" '
"needs to be defined as well"
)
- if self.predefSegmentCenters is not None:
- if self.predefSegmentOrder is None:
+ if self.predef_segment_centers is not None:
+ if self.predef_segment_order is None:
raise ValueError(
- 'If "predefSegmentCenters" is defined, "predefSegmentOrder" '
+ 'If "predef_segment_centers" is defined, "predef_segment_order" '
"needs to be defined as well"
)
- if not isinstance(self.predefSegmentCenters, (list, tuple)):
- raise ValueError("predefSegmentCenters has to be a list or tuple")
+ if not isinstance(self.predef_segment_centers, (list, tuple)):
+ raise ValueError("predef_segment_centers has to be a list or tuple")
return
- def _normalizeTimeSeries(self, sameMean=False):
- """
- Normalizes each time series independently.
-
- :param sameMean: Decides if the time series should have all the same mean value.
- Relevant for weighting time series. optional (default: False)
- :type sameMean: boolean
-
- :returns: normalized time series
- """
- min_max_scaler = preprocessing.MinMaxScaler()
- normalizedTimeSeries = pd.DataFrame(
- min_max_scaler.fit_transform(self.timeSeries),
- columns=self.timeSeries.columns,
- index=self.timeSeries.index,
- )
-
- self._normalizedMean = normalizedTimeSeries.mean()
- if sameMean:
- normalizedTimeSeries /= self._normalizedMean
-
- return normalizedTimeSeries
-
- def _unnormalizeTimeSeries(self, normalizedTimeSeries, sameMean=False):
- """
- Equivalent to '_normalizeTimeSeries'. Just does the back
- transformation.
-
- :param normalizedTimeSeries: Time series which should get back transformated. required
- :type normalizedTimeSeries: pandas.DataFrame()
-
- :param sameMean: Has to have the same value as in _normalizeTimeSeries. optional (default: False)
- :type sameMean: boolean
-
- :returns: unnormalized time series
- """
- from sklearn import preprocessing
-
- min_max_scaler = preprocessing.MinMaxScaler()
- min_max_scaler.fit(self.timeSeries)
-
- if sameMean:
- normalizedTimeSeries *= self._normalizedMean
-
- unnormalizedTimeSeries = pd.DataFrame(
- min_max_scaler.inverse_transform(normalizedTimeSeries),
- columns=normalizedTimeSeries.columns,
- index=normalizedTimeSeries.index,
- )
-
- return unnormalizedTimeSeries
-
- def _preProcessTimeSeries(self):
- """
- Normalize the time series, weight them based on the weight dict and
- puts them into the correct matrix format.
- """
- # first sort the time series in order to avoid bug mention in #18
- self.timeSeries.sort_index(axis=1, inplace=True)
-
- # convert the dataframe to floats
- self.timeSeries = self.timeSeries.astype(float)
-
- # normalize the time series and group them to periodly profiles
- self.normalizedTimeSeries = self._normalizeTimeSeries(sameMean=self.sameMean)
-
- for column in self.weightDict:
- if self.weightDict[column] < MIN_WEIGHT:
- print(
- 'weight of "'
- + str(column)
- + '" set to the minmal tolerable weighting'
- )
- self.weightDict[column] = MIN_WEIGHT
- self.normalizedTimeSeries[column] = (
- self.normalizedTimeSeries[column] * self.weightDict[column]
+ def _translate_representation(self, method=None):
+ """Map old representation_method to new API representation."""
+ if method is None:
+ method = self.representation_method
+ if method is None:
+ return None
+ if method in ("distributionRepresentation", "durationRepresentation"):
+ return Distribution(
+ scope="cluster" if self.distribution_period_wise else "global"
)
-
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", LegacyAPIWarning)
- self.normalizedPeriodlyProfiles, self.timeIndex = unstackToPeriods(
- self.normalizedTimeSeries, self.timeStepsPerPeriod
+ if method == "distributionAndMinMaxRepresentation":
+ return Distribution(
+ scope="cluster" if self.distribution_period_wise else "global",
+ preserve_minmax=True,
)
-
- # check if no NaN is in the resulting profiles
- if self.normalizedPeriodlyProfiles.isnull().values.any():
- raise ValueError(
- "Pre processed data includes NaN. Please check the timeSeries input data."
- )
-
- def _postProcessTimeSeries(self, normalizedTimeSeries, applyWeighting=True):
- """
- Neutralizes the weighting the time series back and unnormalizes them.
- """
- if applyWeighting:
- for column in self.weightDict:
- normalizedTimeSeries[column] = (
- normalizedTimeSeries[column] / self.weightDict[column]
- )
-
- unnormalizedTimeSeries = self._unnormalizeTimeSeries(
- normalizedTimeSeries, sameMean=self.sameMean
+ if method == "minmaxmeanRepresentation":
+ max_cols = [c for c, r in self.representation_dict.items() if r == "max"]
+ min_cols = [c for c, r in self.representation_dict.items() if r == "min"]
+ return MinMaxMean(max_columns=max_cols, min_columns=min_cols)
+ return _REPR_METHOD_MAP.get(method)
+
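A hedged sketch of what the translation yields for a few legacy strings (bypassing `__init__` purely for illustration; the exact reprs depend on the config classes imported at the top of this module):

```python
# Illustrative only: translating legacy representation strings.
tsa = TimeSeriesAggregation.__new__(TimeSeriesAggregation)  # skip __init__ for the sketch
tsa.representation_method = "durationRepresentation"
tsa.distribution_period_wise = False
print(tsa._translate_representation())  # a Distribution with scope='global'

tsa.representation_dict = {"Load": "max", "PV": "min", "Wind": "mean"}
print(tsa._translate_representation("minmaxmeanRepresentation"))
# a MinMaxMean with max_columns=['Load'], min_columns=['PV']
print(tsa._translate_representation("medoidRepresentation"))  # 'medoid'
```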
+ def _build_pipeline_config(self) -> PipelineConfig:
+ """Build PipelineConfig from old-API parameters."""
+ cluster = ClusterConfig(
+ method=_CLUSTER_METHOD_MAP[self.cluster_method], # type: ignore[arg-type]
+ representation=self._translate_representation(),
+ weights=self.weight_dict if self.weight_dict else None,
+ scale_by_column_means=self.same_mean,
+ use_duration_curves=self.sort_values,
+ include_period_sums=self.eval_sum_periods,
+ solver=self.solver,
)
- if self.roundOutput is not None:
- unnormalizedTimeSeries = unnormalizedTimeSeries.round(
- decimals=self.roundOutput
+ extremes = None
+ if self.extreme_period_method != "None":
+ extremes = ExtremeConfig(
+ method=_EXTREME_METHOD_MAP[self.extreme_period_method], # type: ignore[arg-type]
+ max_value=list(self.add_peak_max),
+ min_value=list(self.add_peak_min),
+ max_period=list(self.add_mean_max),
+ min_period=list(self.add_mean_min),
)
+ if not extremes.has_extremes():
+ extremes = None
- return unnormalizedTimeSeries
-
- def _addExtremePeriods(
- self,
- groupedSeries,
- clusterCenters,
- clusterOrder,
- extremePeriodMethod="new_cluster_center",
- addPeakMin=None,
- addPeakMax=None,
- addMeanMin=None,
- addMeanMax=None,
- ):
- """
- Adds different extreme periods based on the to the clustered data,
- decribed by the clusterCenters and clusterOrder.
-
- :param groupedSeries: periodly grouped groupedSeries on which basis it should be decided,
- which period is an extreme period. required
- :type groupedSeries: pandas.DataFrame()
-
- :param clusterCenters: Output from clustering with sklearn. required
- :type clusterCenters: dict
-
- :param clusterOrder: Output from clsutering with sklearn. required
- :type clusterOrder: dict
-
- :param extremePeriodMethod: Chosen extremePeriodMethod. The method. optional(default: 'new_cluster_center' )
- :type extremePeriodMethod: string
-
- :returns: - **newClusterCenters** -- The new cluster centers extended with the extreme periods.
- - **newClusterOrder** -- The new cluster order including the extreme periods.
- - **extremeClusterIdx** -- A list of indices where in the newClusterCenters are the extreme
- periods located.
- """
-
- # init required dicts and lists
- self.extremePeriods = {}
- extremePeriodNo = []
-
- ccList = [center.tolist() for center in clusterCenters]
-
- # check which extreme periods exist in the profile and add them to
- # self.extremePeriods dict
- for column in self.timeSeries.columns:
- if column in addPeakMax:
- stepNo = groupedSeries[column].max(axis=1).idxmax()
- # add only if stepNo is not already in extremePeriods
- # if it is not already a cluster center
- if (
- stepNo not in extremePeriodNo
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
- ):
- max_col = self._append_col_with(column, " max.")
- self.extremePeriods[max_col] = {
- "stepNo": stepNo,
- "profile": groupedSeries.loc[stepNo, :].values,
- "column": column,
- }
- extremePeriodNo.append(stepNo)
-
- if column in addPeakMin:
- stepNo = groupedSeries[column].min(axis=1).idxmin()
- # add only if stepNo is not already in extremePeriods
- # if it is not already a cluster center
- if (
- stepNo not in extremePeriodNo
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
- ):
- min_col = self._append_col_with(column, " min.")
- self.extremePeriods[min_col] = {
- "stepNo": stepNo,
- "profile": groupedSeries.loc[stepNo, :].values,
- "column": column,
- }
- extremePeriodNo.append(stepNo)
-
- if column in addMeanMax:
- stepNo = groupedSeries[column].mean(axis=1).idxmax()
- # add only if stepNo is not already in extremePeriods
- # if it is not already a cluster center
- if (
- stepNo not in extremePeriodNo
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
- ):
- mean_max_col = self._append_col_with(column, " daily max.")
- self.extremePeriods[mean_max_col] = {
- "stepNo": stepNo,
- "profile": groupedSeries.loc[stepNo, :].values,
- "column": column,
- }
- extremePeriodNo.append(stepNo)
-
- if column in addMeanMin:
- stepNo = groupedSeries[column].mean(axis=1).idxmin()
- # add only if stepNo is not already in extremePeriods and
- # if it is not already a cluster center
- if (
- stepNo not in extremePeriodNo
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
- ):
- mean_min_col = self._append_col_with(column, " daily min.")
- self.extremePeriods[mean_min_col] = {
- "stepNo": stepNo,
- "profile": groupedSeries.loc[stepNo, :].values,
- "column": column,
- }
- extremePeriodNo.append(stepNo)
-
- for periodType in self.extremePeriods:
- # get current related clusters of extreme periods
- self.extremePeriods[periodType]["clusterNo"] = clusterOrder[
- self.extremePeriods[periodType]["stepNo"]
- ]
-
- # init new cluster structure
- newClusterCenters = []
- newClusterOrder = clusterOrder
- extremeClusterIdx = []
-
- # integrate extreme periods to clusters
- if extremePeriodMethod == "append":
- # attach extreme periods to cluster centers
- for i, cluster_center in enumerate(clusterCenters):
- newClusterCenters.append(cluster_center)
- for i, periodType in enumerate(self.extremePeriods):
- extremeClusterIdx.append(len(newClusterCenters))
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
- newClusterOrder[self.extremePeriods[periodType]["stepNo"]] = i + len(
- clusterCenters
- )
-
- elif extremePeriodMethod == "new_cluster_center":
- for i, cluster_center in enumerate(clusterCenters):
- newClusterCenters.append(cluster_center)
- # attach extrem periods to cluster centers and consider for all periods
- # if the fit better to the cluster or the extrem period
- for i, periodType in enumerate(self.extremePeriods):
- extremeClusterIdx.append(len(newClusterCenters))
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
- self.extremePeriods[periodType]["newClusterNo"] = i + len(
- clusterCenters
- )
-
- for i, cPeriod in enumerate(newClusterOrder):
- # caclulate euclidean distance to cluster center
- cluster_dist = sum(
- (groupedSeries.iloc[i].values - clusterCenters[cPeriod]) ** 2
- )
- for ii, extremPeriodType in enumerate(self.extremePeriods):
- # exclude other extreme periods from adding to the new
- # cluster center
- isOtherExtreme = False
- for otherExPeriod in self.extremePeriods:
- if (
- i == self.extremePeriods[otherExPeriod]["stepNo"]
- and otherExPeriod != extremPeriodType
- ):
- isOtherExtreme = True
- # calculate distance to extreme periods
- extperiod_dist = sum(
- (
- groupedSeries.iloc[i].values
- - self.extremePeriods[extremPeriodType]["profile"]
- )
- ** 2
- )
- # choose new cluster relation
- if extperiod_dist < cluster_dist and not isOtherExtreme:
- newClusterOrder[i] = self.extremePeriods[extremPeriodType][
- "newClusterNo"
- ]
-
- elif extremePeriodMethod == "replace_cluster_center":
- # Worst Case Clusterperiods
- newClusterCenters = clusterCenters
- for periodType in self.extremePeriods:
- index = groupedSeries.columns.get_loc(
- self.extremePeriods[periodType]["column"]
- )
- newClusterCenters[self.extremePeriods[periodType]["clusterNo"]][
- index
- ] = self.extremePeriods[periodType]["profile"][index]
- if (
- self.extremePeriods[periodType]["clusterNo"]
- not in extremeClusterIdx
- ):
- extremeClusterIdx.append(
- self.extremePeriods[periodType]["clusterNo"]
- )
-
- return newClusterCenters, newClusterOrder, extremeClusterIdx
-
- def _append_col_with(self, column, append_with=" max."):
- """Appends a string to the column name. For MultiIndexes, which turn out to be
- tuples when this method is called, only last level is changed"""
- if isinstance(column, str):
- return column + append_with
- elif isinstance(column, tuple):
- col = list(column)
- col[-1] = col[-1] + append_with
- return tuple(col)
-
- def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx):
- """
- Rescale the values of the clustered Periods such that mean of each time
- series in the typical Periods fits the mean value of the original time
- series, without changing the values of the extremePeriods.
- """
- # Initialize dict to store rescaling deviations per column
- self._rescaleDeviations = {}
-
- weightingVec = pd.Series(self._clusterPeriodNoOccur).values
- columns = list(self.timeSeries.columns)
- n_clusters = len(self.clusterPeriods)
- n_cols = len(columns)
- n_timesteps = self.timeStepsPerPeriod
-
- # Convert to 3D numpy array for fast operations: (n_clusters, n_cols, n_timesteps)
- arr = np.array(self.clusterPeriods).reshape(n_clusters, n_cols, n_timesteps)
-
- # Indices for non-extreme clusters
- idx_wo_peak = np.delete(np.arange(n_clusters), extremeClusterIdx)
- extremeClusterIdx_arr = np.array(extremeClusterIdx, dtype=int)
-
- for ci, column in enumerate(columns):
- # Skip columns excluded from rescaling
- if column in self.rescaleExcludeColumns:
- continue
-
- col_data = arr[:, ci, :] # (n_clusters, n_timesteps)
- sum_raw = self.normalizedPeriodlyProfiles[column].sum().sum()
-
- # Sum of extreme periods (weighted)
- if len(extremeClusterIdx_arr) > 0:
- sum_peak = np.sum(
- weightingVec[extremeClusterIdx_arr]
- * col_data[extremeClusterIdx_arr, :].sum(axis=1)
- )
- else:
- sum_peak = 0.0
-
- sum_clu_wo_peak = np.sum(
- weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
+ segments = None
+ if self.segmentation:
+ seg_repr = self._translate_representation(
+ self.segment_representation_method
+ )
+ segments = SegmentConfig(
+ n_segments=self.no_segments,
+ representation=seg_repr if seg_repr is not None else "mean",
)
- # define the upper scale dependent on the weighting of the series
- scale_ub = 1.0
- if self.sameMean:
- scale_ub = (
- scale_ub
- * self.timeSeries[column].max()
- / self.timeSeries[column].mean()
- )
- if column in self.weightDict:
- scale_ub = scale_ub * self.weightDict[column]
-
- # difference between predicted and original sum
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
-
- # use while loop to rescale cluster periods
- a = 0
- while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR:
- # rescale values (only non-extreme clusters)
- arr[idx_wo_peak, ci, :] *= (sum_raw - sum_peak) / sum_clu_wo_peak
-
- # reset values higher than the upper scale or less than zero
- arr[:, ci, :] = np.clip(arr[:, ci, :], 0, scale_ub)
-
- # Handle NaN (replace with 0)
- np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0)
-
- # calc new sum and new diff to orig data
- col_data = arr[:, ci, :]
- sum_clu_wo_peak = np.sum(
- weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
- )
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
- a += 1
-
- # Calculate and store final deviation
- deviation_pct = (diff / sum_raw) * 100 if sum_raw != 0 else 0.0
- converged = a < MAX_ITERATOR
- self._rescaleDeviations[column] = {
- "deviation_pct": deviation_pct,
- "converged": converged,
- "iterations": a,
- }
-
- if not converged and deviation_pct > 0.01:
- warnings.warn(
- 'Max iteration number reached for "'
- + str(column)
- + '" while rescaling the cluster periods.'
- + " The integral of the aggregated time series deviates by: "
- + str(round(deviation_pct, 2))
- + "%"
- )
-
- # Reshape back to 2D: (n_clusters, n_cols * n_timesteps)
- return arr.reshape(n_clusters, -1)
-
- def _clusterSortedPeriods(
- self, candidates, n_init=20, n_clusters=None, delClusterParams=None
- ):
- """
- Runs the clustering algorithms for the sorted profiles within the period
- instead of the original profiles. (Duration curve clustering)
- """
- # Strip extra evaluation columns for representation
- repr_candidates = (
- candidates[:, :delClusterParams] if delClusterParams else candidates
- )
-
- # Vectorized sort: reshape to 3D (periods x columns x timesteps), sort, reshape back
- values = self.normalizedPeriodlyProfiles.values.copy()
- n_periods, n_total = values.shape
- n_cols = len(self.timeSeries.columns)
- n_timesteps = n_total // n_cols
-
- # Sort each period's timesteps descending for all columns at once
- # Use stable sort for deterministic tie-breaking across environments
- values_3d = values.reshape(n_periods, n_cols, n_timesteps)
- sortedClusterValues = (-np.sort(-values_3d, axis=2, kind="stable")).reshape(
- n_periods, -1
- )
+ predef = None
+ if self.predef_cluster_order is not None:
+ predef = PredefParams(
+ cluster_order=list(self.predef_cluster_order),
+ cluster_center_indices=(
+ list(self.predef_cluster_center_indices)
+ if self.predef_cluster_center_indices is not None
+ else None
+ ),
+ extreme_cluster_idx=(
+ list(self.predef_extreme_cluster_idx)
+ if self.predef_extreme_cluster_idx is not None
+ else None
+ ),
+ segment_order=(
+ [list(s) for s in self.predef_segment_order]
+ if self.predef_segment_order is not None
+ else None
+ ),
+ segment_durations=(
+ [list(s) for s in self.predef_segment_durations]
+ if self.predef_segment_durations is not None
+ else None
+ ),
+ segment_centers=(
+ [list(s) for s in self.predef_segment_centers]
+ if self.predef_segment_centers is not None
+ else None
+ ),
+ )
- if n_clusters is None:
- n_clusters = self.noTypicalPeriods
-
- (
- _altClusterCenters,
- self.clusterCenterIndices,
- clusterOrders_C,
- ) = aggregatePeriods(
- sortedClusterValues,
- n_clusters=n_clusters,
- n_iter=30,
- solver=self.solver,
- clusterMethod=self.clusterMethod,
- representationMethod=self.representationMethod,
- representationDict=self.representationDict,
- distributionPeriodWise=self.distributionPeriodWise,
- timeStepsPerPeriod=self.timeStepsPerPeriod,
+ return PipelineConfig(
+ n_clusters=self.no_typical_periods,
+ n_timesteps_per_period=self.time_steps_per_period,
+ cluster=cluster,
+ extremes=extremes,
+ segments=segments,
+ rescale_cluster_periods=self.rescale_cluster_periods,
+ rescale_exclude_columns=self.rescale_exclude_columns or None,
+ round_decimals=self.round_output,
+ numerical_tolerance=self.numerical_tolerance,
+ temporal_resolution=self.resolution,
+ predef=predef,
)
- clusterCenters_C = []
-
- # take the clusters and determine the most representative sorted
- # period as cluster center
- for clusterNum in np.unique(clusterOrders_C):
- indice = np.where(clusterOrders_C == clusterNum)[0]
- if len(indice) > 1:
- # mean value for each time step for each time series over
- # all Periods in the cluster
- currentMean_C = sortedClusterValues[indice].mean(axis=0)
- # index of the period with the lowest distance to the cluster
- # center
- mindistIdx_C = np.argmin(
- np.square(sortedClusterValues[indice] - currentMean_C).sum(axis=1)
- )
- # append original time series of this period (without extra eval columns)
- medoid_C = repr_candidates[indice][mindistIdx_C]
-
- # append to cluster center
- clusterCenters_C.append(medoid_C)
-
- else:
- # if only on period is part of the cluster, add this index
- clusterCenters_C.append(repr_candidates[indice][0])
-
- return clusterCenters_C, clusterOrders_C
-
- def createTypicalPeriods(self):
+ def create_typical_periods(self):
"""
Clusters the Periods.
- :returns: **self.typicalPeriods** -- All typical Periods in scaled form.
+ :returns: **self.typical_periods** -- All typical Periods in scaled form.
"""
- self._preProcessTimeSeries()
-
- # Compute effective number of clusters for the clustering algorithm
- effective_n_clusters = self.noTypicalPeriods
+        # Sort + cast (matches the old _preProcessTimeSeries behavior)
+ self.time_series.sort_index(axis=1, inplace=True)
+ self.time_series = self.time_series.astype(float)
- # check for additional cluster parameters
- if self.evalSumPeriods:
- evaluationValues = (
- self.normalizedPeriodlyProfiles.stack(future_stack=True, level=0)
- .sum(axis=1)
- .unstack(level=1)
- )
- # how many values have to get deleted later
- delClusterParams = -len(evaluationValues.columns)
- candidates = np.concatenate(
- (self.normalizedPeriodlyProfiles.values, evaluationValues.values),
- axis=1,
- )
- else:
- delClusterParams = None
- candidates = self.normalizedPeriodlyProfiles.values
-
- # skip aggregation procedure for the case of a predefined cluster sequence and get only the correct representation
- if self.predefClusterOrder is not None:
- self._clusterOrder = self.predefClusterOrder
- # check if representatives are defined
- if self.predefClusterCenterIndices is not None:
- self.clusterCenterIndices = self.predefClusterCenterIndices
- repr_candidates = (
- candidates[:, :delClusterParams] if delClusterParams else candidates
- )
- self.clusterCenters = repr_candidates[self.predefClusterCenterIndices]
- else:
- # otherwise take the medoids (strip extra eval columns)
- repr_candidates = (
- candidates[:, :delClusterParams] if delClusterParams else candidates
- )
- self.clusterCenters, self.clusterCenterIndices = representations(
- repr_candidates,
- self._clusterOrder,
- default="medoidRepresentation",
- representationMethod=self.representationMethod,
- representationDict=self.representationDict,
- timeStepsPerPeriod=self.timeStepsPerPeriod,
- )
- else:
- cluster_duration = time.time()
- if not self.sortValues:
- # cluster the data
- (
- self.clusterCenters,
- self.clusterCenterIndices,
- self._clusterOrder,
- ) = aggregatePeriods(
- candidates,
- n_clusters=effective_n_clusters,
- n_iter=100,
- solver=self.solver,
- clusterMethod=self.clusterMethod,
- representationMethod=self.representationMethod,
- representationDict=self.representationDict,
- distributionPeriodWise=self.distributionPeriodWise,
- timeStepsPerPeriod=self.timeStepsPerPeriod,
- n_extra_columns=-delClusterParams if delClusterParams else 0,
- )
- else:
- self.clusterCenters, self._clusterOrder = self._clusterSortedPeriods(
- candidates,
- n_clusters=effective_n_clusters,
- delClusterParams=delClusterParams,
- )
- self.clusteringDuration = time.time() - cluster_duration
-
- # All paths now produce cluster centers without extra evaluation columns,
- # so no stripping is needed.
- self.clusterPeriods = list(self.clusterCenters)
-
- if not self.extremePeriodMethod == "None":
- (
- self.clusterPeriods,
- self._clusterOrder,
- self.extremeClusterIdx,
- ) = self._addExtremePeriods(
- self.normalizedPeriodlyProfiles,
- self.clusterPeriods,
- self._clusterOrder,
- extremePeriodMethod=self.extremePeriodMethod,
- addPeakMin=self.addPeakMin,
- addPeakMax=self.addPeakMax,
- addMeanMin=self.addMeanMin,
- addMeanMax=self.addMeanMax,
- )
- else:
- # Use predefined extreme cluster indices if provided (for transfer/apply)
- if self.predefExtremeClusterIdx is not None:
- self.extremeClusterIdx = list(self.predefExtremeClusterIdx)
- else:
- self.extremeClusterIdx = []
-
- # get number of appearance of the the typical periods
- nums, counts = np.unique(self._clusterOrder, return_counts=True)
- self._clusterPeriodNoOccur = {num: counts[ii] for ii, num in enumerate(nums)}
-
- if self.rescaleClusterPeriods:
- self.clusterPeriods = self._rescaleClusterPeriods(
- self._clusterOrder, self.clusterPeriods, self.extremeClusterIdx
- )
-
- # if additional time steps have been added, reduce the number of occurrence of the typical period
- # which is related to these time steps
- if not len(self.timeSeries) % self.timeStepsPerPeriod == 0:
- self._clusterPeriodNoOccur[self._clusterOrder[-1]] -= (
- 1
- - float(len(self.timeSeries) % self.timeStepsPerPeriod)
- / self.timeStepsPerPeriod
+ # NaN check (must happen before pipeline, same error message)
+ if self.time_series.isnull().values.any():
+ raise ValueError(
+ "Pre processed data includes NaN. Please check the time_series input data."
)
- # put the clustered data in pandas format and scale back
- self.normalizedTypicalPeriods = (
- pd.concat(
- [
- pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
- for s in self.clusterPeriods
- ],
- axis=1,
- )
- .unstack("TimeStep")
- .T
+ # Validate weights before pipeline
+ validated = validate_weights(self.time_series.columns, self.weight_dict or None)
+ if validated is not None:
+ self.weight_dict = validated
+
+ # Run pipeline
+ cfg = self._build_pipeline_config()
+ result = run_pipeline(data=self.time_series, cfg=cfg)
+
+ # Extract state for properties and other methods
+ self._pipeline_result = result
+ self._cluster_order = np.array(result.clustering_result.cluster_assignments)
+ self._cluster_period_no_occur = result.cluster_counts
+ self.cluster_center_indices = (
+ list(result.clustering_result.cluster_centers)
+ if result.clustering_result.cluster_centers is not None
+ else None
+ )
+ self.extreme_cluster_idx = (
+ list(result.clustering_result.extreme_cluster_indices)
+ if result.clustering_result.extreme_cluster_indices is not None
+ else []
)
+ self.clustering_duration = result.clustering_duration
+ self.time_index = result.time_index
- if self.segmentation:
- from tsam.utils.segmentation import segmentation
-
- (
- self.segmentedNormalizedTypicalPeriods,
- self.predictedSegmentedNormalizedTypicalPeriods,
- self.segmentCenterIndices,
- ) = segmentation(
- self.normalizedTypicalPeriods,
- self.noSegments,
- self.timeStepsPerPeriod,
- representationMethod=self.segmentRepresentationMethod,
- representationDict=self.representationDict,
- distributionPeriodWise=self.distributionPeriodWise,
- predefSegmentOrder=self.predefSegmentOrder,
- predefSegmentDurations=self.predefSegmentDurations,
- predefSegmentCenters=self.predefSegmentCenters,
- )
- self.normalizedTypicalPeriods = (
- self.segmentedNormalizedTypicalPeriods.reset_index(level=3, drop=True)
- )
+ # Segmentation data
+ if self.segmentation and result.segmented_df is not None:
+ self.segmented_normalized_typical_periods = result.segmented_df
- self.typicalPeriods = self._postProcessTimeSeries(self.normalizedTypicalPeriods)
+ # typical_periods: alphabetically sorted columns (old API contract)
+ self.typical_periods = result.typical_periods.sort_index(axis=1)
- # check if original time series boundaries are not exceeded
- exceeds_max = self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
- if exceeds_max.any():
- diff = self.typicalPeriods.max(axis=0) - self.timeSeries.max(axis=0)
- exceeding_diff = diff[exceeds_max]
- if exceeding_diff.max() > self.numericalTolerance:
- warnings.warn(
- "At least one maximal value of the "
- + "aggregated time series exceeds the maximal value "
- + "the input time series for: "
- + f"{exceeding_diff.to_dict()}"
- + ". To silence the warning set the 'numericalTolerance' to a higher value."
- )
- below_min = self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
- if below_min.any():
- diff = self.timeSeries.min(axis=0) - self.typicalPeriods.min(axis=0)
- exceeding_diff = diff[below_min]
- if exceeding_diff.max() > self.numericalTolerance:
- warnings.warn(
- "Something went wrong... At least one minimal value of the "
- + "aggregated time series exceeds the minimal value "
- + "the input time series for: "
- + f"{exceeding_diff.to_dict()}"
- + ". To silence the warning set the 'numericalTolerance' to a higher value."
- )
- return self.typicalPeriods
+ return self.typical_periods
- def prepareEnersysInput(self):
+ def prepare_enersys_input(self):
"""
Creates all dictionaries and lists which are required for the energy system
optimization input.
"""
warnings.warn(
- '"prepareEnersysInput" is deprecated, since the created attributes can be directly accessed as properties',
+ '"prepare_enersys_input" is deprecated, since the created attributes can be directly accessed as properties',
DeprecationWarning,
)
return
@property
- def stepIdx(self):
+ def step_idx(self):
"""
Index inside a single cluster
"""
if self.segmentation:
- return [ix for ix in range(0, self.noSegments)]
+ return [ix for ix in range(0, self.no_segments)]
else:
- return [ix for ix in range(0, self.timeStepsPerPeriod)]
+ return [ix for ix in range(0, self.time_steps_per_period)]
@property
- def clusterPeriodIdx(self):
+ def cluster_period_idx(self):
"""
Index of the clustered periods
"""
- if not hasattr(self, "clusterOrder"):
- self.createTypicalPeriods()
- return np.sort(np.unique(self._clusterOrder))
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
+ return np.sort(np.unique(self._cluster_order))
@property
- def clusterOrder(self):
+ def cluster_order(self):
"""
The sequence of typical periods that represents
the original time series
"""
- if not hasattr(self, "_clusterOrder"):
- self.createTypicalPeriods()
- return self._clusterOrder
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
+ return self._cluster_order
@property
- def clusterPeriodNoOccur(self):
+ def cluster_period_no_occur(self):
"""
How often does a typical period occur in the original time series
"""
- if not hasattr(self, "clusterOrder"):
- self.createTypicalPeriods()
- return self._clusterPeriodNoOccur
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
+ return self._cluster_period_no_occur
@property
- def clusterPeriodDict(self):
+ def cluster_period_dict(self):
"""
Time series data for each period index as dictionary
"""
- if not hasattr(self, "_clusterOrder"):
- self.createTypicalPeriods()
- if not hasattr(self, "_clusterPeriodDict"):
- self._clusterPeriodDict = {}
- for column in self.typicalPeriods:
- self._clusterPeriodDict[column] = self.typicalPeriods[column].to_dict()
- return self._clusterPeriodDict
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
+ if not hasattr(self, "_cluster_period_dict"):
+ self._cluster_period_dict = {}
+ for column in self.typical_periods:
+ self._cluster_period_dict[column] = self.typical_periods[
+ column
+ ].to_dict()
+ return self._cluster_period_dict
@property
- def segmentDurationDict(self):
+ def segment_duration_dict(self):
"""
Segment duration in time steps for each period index as dictionary
"""
- if not hasattr(self, "_clusterOrder"):
- self.createTypicalPeriods()
- if not hasattr(self, "_segmentDurationDict"):
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
+ if not hasattr(self, "_segment_duration_dict"):
if self.segmentation:
- self._segmentDurationDict = (
- self.segmentedNormalizedTypicalPeriods.drop(
- self.segmentedNormalizedTypicalPeriods.columns, axis=1
+ self._segment_duration_dict = (
+ self.segmented_normalized_typical_periods.drop(
+ self.segmented_normalized_typical_periods.columns, axis=1
)
.reset_index(level=3, drop=True)
.reset_index(2)
.to_dict()
)
else:
- self._segmentDurationDict = self.typicalPeriods.drop(
- self.typicalPeriods.columns, axis=1
+ self._segment_duration_dict = self.typical_periods.drop(
+ self.typical_periods.columns, axis=1
)
- self._segmentDurationDict["Segment Duration"] = 1
- self._segmentDurationDict = self._segmentDurationDict.to_dict()
+ self._segment_duration_dict["Segment Duration"] = 1
+ self._segment_duration_dict = self._segment_duration_dict.to_dict()
warnings.warn(
"Segmentation is turned off. All segments are consistent the time steps."
)
- return self._segmentDurationDict
+ return self._segment_duration_dict
- def predictOriginalData(self):
+ def predict_original_data(self):
"""
Predicts the overall time series as if every period were placed in its
related cluster center
- :returns: **predictedData** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
+ :returns: **predicted_data** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
"""
- if not hasattr(self, "_clusterOrder"):
- self.createTypicalPeriods()
-
- # Select typical periods source based on segmentation
- if self.segmentation:
- typical = self.predictedSegmentedNormalizedTypicalPeriods
- else:
- typical = self.normalizedTypicalPeriods
-
- from tsam.config import _expand_periods
-
- clustered_data_df = _expand_periods(typical, tuple(self._clusterOrder))
-
- # back in form
- self.normalizedPredictedData = pd.DataFrame(
- clustered_data_df.values[: len(self.timeSeries)],
- index=self.timeSeries.index,
- columns=self.timeSeries.columns,
+ if not hasattr(self, "_pipeline_result"):
+ self.create_typical_periods()
+ self.predicted_data = self._pipeline_result.reconstructed_data.sort_index(
+ axis=1
)
- # For the non-segmentation path, normalizedTypicalPeriods was already
- # unweighted and sameMean-reversed in-place by createTypicalPeriods →
- # _postProcessTimeSeries. We must undo the sameMean in-place change
- # so _unnormalizeTimeSeries can re-apply it during inverse transform.
- #
- # For the segmentation path, predictedSegmentedNormalizedTypicalPeriods
- # was NOT modified in-place, so it still carries weights and sameMean.
- # We pass applyWeighting=True so _postProcessTimeSeries removes them.
- if self.segmentation:
- self.predictedData = self._postProcessTimeSeries(
- self.normalizedPredictedData, applyWeighting=True
- )
- else:
- if self.sameMean:
- self.normalizedPredictedData /= self._normalizedMean
- self.predictedData = self._postProcessTimeSeries(
- self.normalizedPredictedData, applyWeighting=False
- )
-
- return self.predictedData
+ return self.predicted_data
- def indexMatching(self):
+ def index_matching(self):
"""
Relates the index of the original time series to the indices
represented by the clusters
- :returns: **timeStepMatching** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
+ :returns: **time_step_matching** (pandas.DataFrame) -- DataFrame indexed like the original time series that maps each original time step to its typical period and time step (and segment).
"""
- if not hasattr(self, "_clusterOrder"):
- self.createTypicalPeriods()
+ if not hasattr(self, "_cluster_order"):
+ self.create_typical_periods()
# create aggregated period and time step index lists
- periodIndex = []
- stepIndex = []
- for label in self._clusterOrder:
- for step in range(self.timeStepsPerPeriod):
- periodIndex.append(label)
- stepIndex.append(step)
+ period_index = []
+ step_index = []
+ for label in self._cluster_order:
+ for step in range(self.time_steps_per_period):
+ period_index.append(label)
+ step_index.append(step)
# create a dataframe
- timeStepMatching = pd.DataFrame(
- [periodIndex, stepIndex],
+ time_step_matching = pd.DataFrame(
+ [period_index, step_index],
index=["PeriodNum", "TimeStep"],
- columns=self.timeIndex,
+ columns=self.time_index,
).T
# if segmentation is chosen, append another column stating which segment each time step belongs to
if self.segmentation:
- segmentIndex = []
- for label in self._clusterOrder:
- segmentIndex.extend(
+ segment_index = []
+ for label in self._cluster_order:
+ segment_index.extend(
np.repeat(
- self.segmentedNormalizedTypicalPeriods.loc[
+ self.segmented_normalized_typical_periods.loc[
label, :
].index.get_level_values(0),
- self.segmentedNormalizedTypicalPeriods.loc[
+ self.segmented_normalized_typical_periods.loc[
label, :
].index.get_level_values(1),
).values
)
- timeStepMatching = pd.DataFrame(
- [periodIndex, stepIndex, segmentIndex],
+ time_step_matching = pd.DataFrame(
+ [period_index, step_index, segment_index],
index=["PeriodNum", "TimeStep", "SegmentIndex"],
- columns=self.timeIndex,
+ columns=self.time_index,
).T
- return timeStepMatching
+ return time_step_matching
- def accuracyIndicators(self):
+ def accuracy_indicators(self):
"""
Compares the predicted data with the original time series.
- :returns: **pd.DataFrame(indicatorRaw)** (pandas.DataFrame) -- Dataframe containing indicators evaluating the
- accuracy of the
- aggregation
+ :returns: **pd.DataFrame(indicator_raw)** (pandas.DataFrame) -- Dataframe containing indicators evaluating the
+ accuracy of the aggregation
"""
- if not hasattr(self, "predictedData"):
- self.predictOriginalData()
-
- indicatorRaw = {
- "RMSE": {},
- "RMSE_duration": {},
- "MAE": {},
- } # 'Silhouette score':{},
-
- for column in self.normalizedTimeSeries.columns:
- if self.weightDict:
- origTS = self.normalizedTimeSeries[column] / self.weightDict[column]
- else:
- origTS = self.normalizedTimeSeries[column]
- predTS = self.normalizedPredictedData[column]
- indicatorRaw["RMSE"][column] = np.sqrt(mean_squared_error(origTS, predTS))
- indicatorRaw["RMSE_duration"][column] = np.sqrt(
- mean_squared_error(
- origTS.sort_values(ascending=False).reset_index(drop=True),
- predTS.sort_values(ascending=False).reset_index(drop=True),
- )
- )
- indicatorRaw["MAE"][column] = mean_absolute_error(origTS, predTS)
+ if not hasattr(self, "_pipeline_result"):
+ self.create_typical_periods()
+ return self._pipeline_result.accuracy_indicators
- return pd.DataFrame(indicatorRaw)
-
- def totalAccuracyIndicators(self):
+ def total_accuracy_indicators(self):
"""
Derives the accuracy indicators over all time series
"""
return np.sqrt(
- self.accuracyIndicators().pow(2).sum()
- / len(self.normalizedTimeSeries.columns)
+ self.accuracy_indicators().pow(2).sum() / len(self.time_series.columns)
)
+
+ # Backward-compatible method aliases (deprecated)
+ createTypicalPeriods = create_typical_periods
+ predictOriginalData = predict_original_data
+ accuracyIndicators = accuracy_indicators
+ totalAccuracyIndicators = total_accuracy_indicators
+ prepareEnersysInput = prepare_enersys_input
+ indexMatching = index_matching
+
+ # Backward-compatible property aliases (deprecated)
+ stepIdx = step_idx
+ clusterPeriodIdx = cluster_period_idx
+ clusterOrder = cluster_order
+ clusterPeriodNoOccur = cluster_period_no_occur
+ clusterPeriodDict = cluster_period_dict
+ segmentDurationDict = segment_duration_dict
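
For orientation, a minimal usage sketch of the renamed entry points and the alias block above. The class name TimeSeriesAggregation and the CSV layout are assumed from the tests elsewhere in this patch; the snippet is illustrative, not part of the patch itself:

import pandas as pd
import tsam.timeseriesaggregation as tsam

raw = pd.read_csv("testdata.csv", index_col=0)  # hypothetical input file
agg = tsam.TimeSeriesAggregation(
    raw,
    no_typical_periods=8,
    hours_per_period=24,
    cluster_method="hierarchical",
)
typical = agg.create_typical_periods()
# the deprecated camelCase names resolve to the very same members:
assert agg.createTypicalPeriods == agg.create_typical_periods
assert (agg.clusterOrder == agg.cluster_order).all()
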
diff --git a/src/tsam/tuning.py b/src/tsam/tuning.py
index 7ab6f225..f66b1317 100644
--- a/src/tsam/tuning.py
+++ b/src/tsam/tuning.py
@@ -11,7 +11,7 @@
import tempfile
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, TypedDict
@@ -72,9 +72,9 @@ def _test_single_config_file(
)
# Reconstruct configs from serialized dicts
- cluster = ClusterConfig(**opts["cluster_dict"])
+ cluster = ClusterConfig.from_dict(opts["cluster_dict"])
extremes = (
- ExtremeConfig(**opts["extremes_dict"])
+ ExtremeConfig.from_dict(opts["extremes_dict"])
if opts["extremes_dict"] is not None
else None
)
@@ -146,10 +146,10 @@ def _parallel_context(
serialized_opts = {
"period_duration": aggregate_opts["period_duration"],
"temporal_resolution": aggregate_opts["temporal_resolution"],
- "cluster_dict": asdict(aggregate_opts["cluster"]),
+ "cluster_dict": aggregate_opts["cluster"].to_dict(),
"segment_representation": aggregate_opts["segment_representation"],
"extremes_dict": (
- asdict(aggregate_opts["extremes"])
+ aggregate_opts["extremes"].to_dict()
if aggregate_opts["extremes"] is not None
else None
),
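
The switch from dataclasses.asdict to to_dict/from_dict matters once config dataclasses nest other dataclasses: asdict recurses, so rebuilding with ClusterConfig(**asdict(cfg)) would hand a plain dict to a field that expects a config object. A self-contained sketch of that failure mode, using hypothetical stand-in classes rather than the real tsam configs:

from __future__ import annotations

from dataclasses import asdict, dataclass


@dataclass
class Inner:
    x: int = 1


@dataclass
class Outer:
    inner: Inner | None = None

    def to_dict(self) -> dict:
        return {"inner": asdict(self.inner)}

    @classmethod
    def from_dict(cls, d: dict) -> Outer:
        return cls(inner=Inner(**d["inner"]))


o = Outer(inner=Inner(2))
assert Outer.from_dict(o.to_dict()).inner.x == 2
# Outer(**asdict(o)).inner would be {'x': 2} -- a dict, not an Inner instance.
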
diff --git a/src/tsam/utils/durationRepresentation.py b/src/tsam/utils/duration_representation.py
similarity index 54%
rename from src/tsam/utils/durationRepresentation.py
rename to src/tsam/utils/duration_representation.py
index 3a661f32..5bc0ffb2 100644
--- a/src/tsam/utils/durationRepresentation.py
+++ b/src/tsam/utils/duration_representation.py
@@ -6,50 +6,50 @@
import pandas as pd
-def durationRepresentation(
+def duration_representation(
candidates,
- clusterOrder,
- distributionPeriodWise,
- timeStepsPerPeriod,
- representMinMax=False,
+ cluster_order,
+ distribution_period_wise,
+ n_timesteps_per_period,
+ represent_min_max=False,
):
"""
- Represents the candidates of a given cluster group (clusterOrder)
+ Represents the candidates of a given cluster group (cluster_order)
such that the duration curve of every attribute is fit as closely as possible.
:param candidates: Matrix where each row represents a candidate period
:type candidates: np.ndarray
- :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
- :type clusterOrder: np.array
+ :param cluster_order: Integer array where the index refers to the candidate and the Integer entry to the group
+ :type cluster_order: np.array
- :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
- :type representMinMax: bool
+ :param represent_min_max: If in every cluster the minimum and the maximum of the attribute should be represented
+ :type represent_min_max: bool
"""
# build a pd.DataFrame in which each row represents a candidate, and the columns are defined by two levels:
# the attributes and the time steps inside the candidates.
- columnTuples = []
- num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
+ column_tuples = []
+ num_attributes = int(candidates.shape[1] / n_timesteps_per_period)
for i in range(num_attributes):
- for j in range(timeStepsPerPeriod):
- columnTuples.append((i, j))
+ for j in range(n_timesteps_per_period):
+ column_tuples.append((i, j))
candidates_df = pd.DataFrame(
- candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
+ candidates, columns=pd.MultiIndex.from_tuples(column_tuples)
)
# There are two options for the duration representation. Either, the distribution of each cluster is preserved
- # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
+ # (period_wise = True) or the distribution of the total time series is preserved only. In the latter case, the
# inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
- if distributionPeriodWise:
+ if distribution_period_wise:
n_attrs = num_attributes
# Reshape to 3D: (periods, attributes, timesteps)
- candidates_3d = candidates.reshape(-1, n_attrs, timeStepsPerPeriod)
+ candidates_3d = candidates.reshape(-1, n_attrs, n_timesteps_per_period)
- clusterCenters = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)[0]
+ cluster_centers = []
+ for cluster_num in np.unique(cluster_order):
+ indice = np.where(cluster_order == cluster_num)[0]
n_cands = len(indice)
if n_cands == 0:
continue
@@ -59,12 +59,12 @@ def durationRepresentation(
# Sort all values per attribute, then reshape to duration curve
flat = cluster_data.reshape(n_attrs, -1)
- flat = np.sort(flat, axis=1, kind="stable")
- repr_values = flat.reshape(n_attrs, timeStepsPerPeriod, n_cands).mean(
+ flat.sort(axis=1, kind="stable")
+ repr_values = flat.reshape(n_attrs, n_timesteps_per_period, n_cands).mean(
axis=2
)
- if representMinMax:
+ if represent_min_max:
repr_values[:, 0] = flat[:, 0]
repr_values[:, -1] = flat[:, -1]
@@ -77,40 +77,40 @@ def durationRepresentation(
final_repr = np.empty_like(repr_values)
final_repr[rows, order] = repr_values
- clusterCenters.append(final_repr.ravel())
+ cluster_centers.append(final_repr.ravel())
else:
- clusterCentersList = []
+ cluster_centers_list = []
for a in candidates_df.columns.levels[0]:
- meanVals = []
- clusterLengths = []
- for clusterNum in np.unique(clusterOrder):
- indice = np.where(clusterOrder == clusterNum)
- noCandidates = len(indice[0])
+ mean_vals = []
+ cluster_lengths = []
+ for cluster_num in np.unique(cluster_order):
+ indice = np.where(cluster_order == cluster_num)
+ n_candidates = len(indice[0])
# get all the values of a certain attribute and cluster
- candidateValues = candidates_df.loc[indice[0], a]
+ candidate_values = candidates_df.loc[indice[0], a]
# calculate centroid of each cluster and append to list
- meanVals.append(np.round(candidateValues.mean(), 10))
+ mean_vals.append(np.round(candidate_values.mean(), 10))
# make a list of weights of each cluster for each time step within the period
- clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
+ cluster_lengths.append(np.repeat(n_candidates, n_timesteps_per_period))
# concat centroid values and cluster weights for all clusters
- meansAndWeights = pd.concat(
+ means_and_weights = pd.concat(
[
- pd.DataFrame(np.array(meanVals)).stack(
+ pd.DataFrame(np.array(mean_vals)).stack(
future_stack=True,
),
- pd.DataFrame(np.array(clusterLengths)).stack(
+ pd.DataFrame(np.array(cluster_lengths)).stack(
future_stack=True,
),
],
axis=1,
)
# sort all values of all clusters according to the centroid values
- meansAndWeightsSorted = meansAndWeights.sort_values(0, kind="stable")
+ means_and_weights_sorted = means_and_weights.sort_values(0, kind="stable")
# save order of the sorted centroid values across all clusters
- order = meansAndWeightsSorted.index
+ order = means_and_weights_sorted.index
# sort all values of the original time series
- sortedAttr = (
+ sorted_attr = (
candidates_df.loc[:, a]
.stack(
future_stack=True,
@@ -120,71 +120,71 @@ def durationRepresentation(
)
# take the mean of each section of the original duration curve according to the cluster (and its
# weight) that the respective section is assigned to
- representationValues = []
+ representation_values = []
counter = 0
- for i, j in enumerate(meansAndWeightsSorted[1]):
- representationValues.append(sortedAttr[counter : counter + j].mean())
+ for i, j in enumerate(means_and_weights_sorted[1]):
+ representation_values.append(sorted_attr[counter : counter + j].mean())
counter += j
# respect max and min of the attributes
- if representMinMax:
- representationValues = _representMinMax(
- representationValues,
- sortedAttr,
- meansAndWeightsSorted,
- keepSum=True,
+ if represent_min_max:
+ representation_values = _represent_min_max(
+ representation_values,
+ sorted_attr,
+ means_and_weights_sorted,
+ keep_sum=True,
)
# transform all representation values to a data frame and arrange it
# according to the order of the sorted
# centroid values
- representationValues = pd.DataFrame(np.array(representationValues))
- representationValues.index = order
- representationValues.sort_index(inplace=True)
+ representation_values = pd.DataFrame(np.array(representation_values))
+ representation_values.index = order
+ representation_values.sort_index(inplace=True)
# append all cluster values attribute-wise to a list
- clusterCentersList.append(representationValues.unstack())
+ cluster_centers_list.append(representation_values.unstack())
# rearrange so that rows are the cluster centers and columns are time steps x attributes
- clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
+ cluster_centers = np.array(pd.concat(cluster_centers_list, axis=1))
- return clusterCenters
+ return cluster_centers
-def _representMinMax(
- representationValues, sortedAttr, meansAndWeightsSorted, keepSum=True
+def _represent_min_max(
+ representation_values, sorted_attr, means_and_weights_sorted, keep_sum=True
):
"""
Represents the min and max values of the original time series in the
duration curve representation such that the min and max values of the
original time series are preserved.
- :param representationValues: The duration curve representation values
- :type representationValues: np.array
+ :param representation_values: The duration curve representation values
+ :type representation_values: np.array
- :param sortedAttr: The sorted original time series
- :type sortedAttr: np.array
+ :param sorted_attr: The sorted original time series
+ :type sorted_attr: np.array
- :param meansAndWeightsSorted: The number of occureance of
+ :param means_and_weights_sorted: The number of occurrences of
the original time series.
- :type meansAndWeightsSorted: pd.DataFrame
+ :type means_and_weights_sorted: pd.DataFrame
- :param keepSum: If the sum of the duration curve should be preserved
- :type keepSum: bool
+ :param keep_sum: If the sum of the duration curve should be preserved
+ :type keep_sum: bool
"""
- if np.any(np.array(representationValues) < 0):
+ if np.any(np.array(representation_values) < 0):
raise ValueError("Negative values in the duration curve representation")
# first retrieve how far the representation's end values deviate from the
# min and max of the original time series, and how often those extremes
# occur in the original time series
- delta_max = sortedAttr.max() - representationValues[-1]
- appearance_max = meansAndWeightsSorted[1].iloc[-1]
- delta_min = sortedAttr.min() - representationValues[0]
- appearance_min = meansAndWeightsSorted[1].iloc[0]
+ delta_max = sorted_attr.max() - representation_values[-1]
+ appearance_max = means_and_weights_sorted[1].iloc[-1]
+ delta_min = sorted_attr.min() - representation_values[0]
+ appearance_min = means_and_weights_sorted[1].iloc[0]
if delta_min == 0 and delta_max == 0:
- return representationValues
+ return representation_values
- if keepSum:
+ if keep_sum:
# now anticipate the shift of the sum of the time series
# due to the change of the min and max values
# of the duration curve
@@ -193,25 +193,27 @@ def _representMinMax(
# the mean of the duration curve
correction_factor = (
-delta_sum
- / (meansAndWeightsSorted[1].iloc[1:-1] * representationValues[1:-1]).sum()
+ / (
+ means_and_weights_sorted[1].iloc[1:-1] * representation_values[1:-1]
+ ).sum()
)
if correction_factor < -1 or correction_factor > 1:
warnings.warn(
"The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved."
)
- return representationValues
+ return representation_values
# correct the values of the duration curve such
# that the mean of the duration curve is preserved
# since the min and max values are changed
- representationValues[1:-1] = np.multiply(
- representationValues[1:-1], (1 + correction_factor)
+ representation_values[1:-1] = np.multiply(
+ representation_values[1:-1], (1 + correction_factor)
)
# change the values of the duration curve such that the min and max
# values are preserved
- representationValues[-1] += delta_max
- representationValues[0] += delta_min
+ representation_values[-1] += delta_max
+ representation_values[0] += delta_min
- return representationValues
+ return representation_values
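
A small illustrative call of the renamed function on made-up data; in the period-wise branch, each cluster's sorted member values are averaged into one duration-curve-preserving representative:

import numpy as np

from tsam.utils.duration_representation import duration_representation

# 4 candidate periods, 1 attribute, 3 time steps each
candidates = np.array(
    [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [0.0, 1.0, 2.0],
        [3.0, 4.0, 5.0],
    ]
)
cluster_order = np.array([0, 0, 1, 1])
centers = duration_representation(
    candidates,
    cluster_order,
    distribution_period_wise=True,
    n_timesteps_per_period=3,
)
# one representative per cluster, e.g. cluster 0 averages its members'
# sorted values ([1, 2, 2, 3, 3, 4]) pairwise into (1.5, 2.5, 3.5)
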
diff --git a/src/tsam/utils/k_maxoids.py b/src/tsam/utils/k_maxoids.py
index 41632281..d1c13852 100644
--- a/src/tsam/utils/k_maxoids.py
+++ b/src/tsam/utils/k_maxoids.py
@@ -89,24 +89,24 @@ def _check_array(self, X):
return X
- def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
- X_old = X
+ def k_maxoids(self, X, k, n_passes=5, do_logarithmic=False, n_init=100):
+ x_old = X
n, _m = X.shape
- inertiaTempPrime = None
+ inertia_best = None
for i in range(n_init):
inds = rnd.permutation(np.arange(n))
X = X[inds]
M = np.copy(X[:k])
- for t in range(numpasses):
+ for t in range(n_passes):
for j in range(n):
x = X[j]
D = np.sum((M - x) ** 2, axis=1)
i = np.argmin(D)
d = np.sum((M - M[i]) ** 2, axis=1)
- if doLogarithmic:
+ if do_logarithmic:
D[i] = 1.0
d[i] = 1.0
valx = np.prod(D)
@@ -120,19 +120,19 @@ def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
if valx > valm:
M[i] = x
- dTemp = self.distance_func(X_old, Y=list(M))
- inertiaTemp = np.sum(np.min(dTemp, axis=1))
+ d_temp = self.distance_func(x_old, Y=list(M))
+ inertia_temp = np.sum(np.min(d_temp, axis=1))
- if inertiaTempPrime is None:
- mFinal = M
- inertiaTempPrime = inertiaTemp
+ if inertia_best is None:
+ m_final = M
+ inertia_best = inertia_temp
else:
- if inertiaTemp < inertiaTempPrime:
- mFinal = M
- inertiaTempPrime = inertiaTemp
+ if inertia_temp < inertia_best:
+ m_final = M
+ inertia_best = inertia_temp
- D = self.distance_func(X_old, Y=list(mFinal))
+ D = self.distance_func(x_old, Y=list(m_final))
I = np.argmin(D, axis=1)
- return list(mFinal), I
+ return list(m_final), I
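
A hedged usage sketch of the renamed k_maxoids signature; the enclosing class and its distance_func are defined outside this hunk, so the call itself is left commented:

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((50, 4))
# maxoids, labels = SomeKMaxoids().k_maxoids(X, k=3, n_passes=5, n_init=10)
# `maxoids` would hold the k extreme representatives from the best of n_init
# random restarts (lowest total inertia), and `labels[i]` would map row i to
# its nearest maxoid under the instance's distance_func.
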
diff --git a/src/tsam/utils/segmentation.py b/src/tsam/utils/segmentation.py
index 70a2cfad..ccf59ef0 100644
--- a/src/tsam/utils/segmentation.py
+++ b/src/tsam/utils/segmentation.py
@@ -6,234 +6,235 @@
def segmentation(
- normalizedTypicalPeriods,
- noSegments,
- timeStepsPerPeriod,
- representationMethod=None,
- representationDict=None,
- distributionPeriodWise=True,
- predefSegmentOrder=None,
- predefSegmentDurations=None,
- predefSegmentCenters=None,
+ normalized_typical_periods,
+ n_segments,
+ n_timesteps_per_period,
+ representation_method=None,
+ representation_dict=None,
+ distribution_period_wise=True,
+ predef_segment_order=None,
+ predef_segment_durations=None,
+ predef_segment_centers=None,
):
"""
Agglomerative clustering of adjacent time steps within a set of typical periods in order to further reduce the
temporal resolution within typical periods and thus the complexity of the input data.
- :param normalizedTypicalPeriods: MultiIndex DataFrame containing the typical periods as first index, the time steps
+ :param normalized_typical_periods: MultiIndex DataFrame containing the typical periods as first index, the time steps
within the periods as second index and the attributes as columns.
- :type normalizedTypicalPeriods: pandas DataFrame
+ :type normalized_typical_periods: pandas DataFrame
- :param noSegments: Number of segments in which the typical periods should be subdivided - equivalent to the number of
+ :param n_segments: Number of segments in which the typical periods should be subdivided - equivalent to the number of
inner-period clusters.
- :type noSegments: integer
+ :type n_segments: integer
- :param timeStepsPerPeriod: Number of time steps per period
- :type timeStepsPerPeriod: integer
+ :param n_timesteps_per_period: Number of time steps per period
+ :type n_timesteps_per_period: integer
- :param predefSegmentOrder: Predefined segment assignments per timestep, per typical period.
+ :param predef_segment_order: Predefined segment assignments per timestep, per typical period.
If provided, skips clustering and uses these assignments directly.
List of lists/arrays, one per typical period.
- :type predefSegmentOrder: list or None
+ :type predef_segment_order: list or None
- :param predefSegmentDurations: Predefined durations per segment, per typical period.
- Required if predefSegmentOrder is provided.
+ :param predef_segment_durations: Predefined durations per segment, per typical period.
+ Required if predef_segment_order is provided.
List of lists/arrays, one per typical period.
- :type predefSegmentDurations: list or None
+ :type predef_segment_durations: list or None
- :param predefSegmentCenters: Predefined center indices per segment, per typical period.
- If provided with predefSegmentOrder, uses these as segment centers
+ :param predef_segment_centers: Predefined center indices per segment, per typical period.
+ If provided with predef_segment_order, uses these as segment centers
instead of calculating representations.
List of lists/arrays, one per typical period.
- :type predefSegmentCenters: list or None
+ :type predef_segment_centers: list or None
- :returns: - **segmentedNormalizedTypicalPeriods** (pandas DataFrame) -- MultiIndex DataFrame similar to
- normalizedTypicalPeriods but with segments instead of time steps. Moreover, two additional index
+ :returns: - **segmented_typical** (pandas DataFrame) -- MultiIndex DataFrame similar to
+ normalized_typical_periods but with segments instead of time steps. Moreover, two additional index
levels define the length of each segment and the time step index at which each segment starts.
- - **predictedSegmentedNormalizedTypicalPeriods** (pandas DataFrame) -- MultiIndex DataFrame with the same
- shape of normalizedTypicalPeriods, but with overwritten values derived from segmentation used for
+ - **predicted_segmented** (pandas DataFrame) -- MultiIndex DataFrame with the same
+ shape of normalized_typical_periods, but with overwritten values derived from segmentation used for
prediction of the original periods and accuracy indicators.
- - **segmentCenterIndicesList** (list) -- List of segment center indices per typical period.
+ - **segment_center_indices_list** (list) -- List of segment center indices per typical period.
Each entry is a list of indices indicating which timestep is the representative for each segment.
"""
# Initialize lists for predicted and segmented DataFrame
- segmentedNormalizedTypicalPeriodsList = []
- predictedSegmentedNormalizedTypicalPeriodsList = []
- segmentCenterIndicesList = []
+ segmented_list = []
+ predicted_list = []
+ segment_center_indices_list = []
# Get unique period indices
- period_indices = normalizedTypicalPeriods.index.get_level_values(0).unique()
+ period_indices = normalized_typical_periods.index.get_level_values(0).unique()
n_clusters = len(period_indices)
# Validate predefined segment array lengths
- if predefSegmentOrder is not None:
- if len(predefSegmentOrder) != n_clusters:
+ if predef_segment_order is not None:
+ if len(predef_segment_order) != n_clusters:
raise ValueError(
- f"predefSegmentOrder has {len(predefSegmentOrder)} entries "
+ f"predef_segment_order has {len(predef_segment_order)} entries "
f"but data has {n_clusters} periods"
)
if (
- predefSegmentDurations is not None
- and len(predefSegmentDurations) != n_clusters
+ predef_segment_durations is not None
+ and len(predef_segment_durations) != n_clusters
):
raise ValueError(
- f"predefSegmentDurations has {len(predefSegmentDurations)} entries "
+ f"predef_segment_durations has {len(predef_segment_durations)} entries "
f"but data has {n_clusters} periods"
)
- if predefSegmentCenters is not None and len(predefSegmentCenters) != n_clusters:
+ if (
+ predef_segment_centers is not None
+ and len(predef_segment_centers) != n_clusters
+ ):
raise ValueError(
- f"predefSegmentCenters has {len(predefSegmentCenters)} entries "
+ f"predef_segment_centers has {len(predef_segment_centers)} entries "
f"but data has {n_clusters} periods"
)
# Validate segment durations sum to timesteps per period
- if predefSegmentDurations is not None:
- for i, durations in enumerate(predefSegmentDurations):
+ if predef_segment_durations is not None:
+ for i, durations in enumerate(predef_segment_durations):
duration_sum = sum(durations)
- if duration_sum != timeStepsPerPeriod:
+ if duration_sum != n_timesteps_per_period:
raise ValueError(
- f"predefSegmentDurations for period {i} sum to {duration_sum} "
- f"but timeStepsPerPeriod is {timeStepsPerPeriod}"
+ f"predef_segment_durations for period {i} sum to {duration_sum} "
+ f"but n_timesteps_per_period is {n_timesteps_per_period}"
)
# Validate segment center indices are within bounds
- if predefSegmentCenters is not None:
- for i, centers in enumerate(predefSegmentCenters):
+ if predef_segment_centers is not None:
+ for i, centers in enumerate(predef_segment_centers):
for idx in centers:
- if idx < 0 or idx >= timeStepsPerPeriod:
+ if idx < 0 or idx >= n_timesteps_per_period:
raise ValueError(
- f"predefSegmentCenters index {idx} for period {i} "
- f"is out of bounds [0, {timeStepsPerPeriod})"
+ f"predef_segment_centers index {idx} for period {i} "
+ f"is out of bounds [0, {n_timesteps_per_period})"
)
# do for each typical period
for period_i, period_label in enumerate(period_indices):
# make numpy array with rows containing the segmentation candidates (time steps)
# and columns as dimensions of the
- segmentationCandidates = np.asarray(
- normalizedTypicalPeriods.loc[period_label, :]
+ segmentation_candidates = np.asarray(
+ normalized_typical_periods.loc[period_label, :]
)
# Check if using predefined segments for this period
- if predefSegmentOrder is not None:
+ if predef_segment_order is not None:
# Use predefined segment order
- clusterOrder = np.asarray(predefSegmentOrder[period_i])
+ cluster_order = np.asarray(predef_segment_order[period_i])
# Get predefined durations
- segmentNoOccur = np.asarray(predefSegmentDurations[period_i])
+ segment_no_occur = np.asarray(predef_segment_durations[period_i])
# Calculate segment numbers and start indices from durations
- segNo = np.arange(noSegments)
- indices = np.concatenate([[0], np.cumsum(segmentNoOccur)[:-1]])
+ seg_no = np.arange(n_segments)
+ indices = np.concatenate([[0], np.cumsum(segment_no_occur)[:-1]])
# The unique cluster order is just 0, 1, 2, ..., n_segments-1 in order
- clusterOrderUnique = list(range(noSegments))
+ cluster_order_unique = list(range(n_segments))
# Determine segment values
- if predefSegmentCenters is not None:
+ if predef_segment_centers is not None:
# Use predefined centers directly
- segmentCenterIndices = list(predefSegmentCenters[period_i])
- clusterCenters = segmentationCandidates[segmentCenterIndices]
+ segment_center_indices = list(predef_segment_centers[period_i])
+ cluster_centers = segmentation_candidates[segment_center_indices]
else:
# Calculate representations from predefined order
- clusterCenters, segmentCenterIndices = representations(
- segmentationCandidates,
- clusterOrder,
- default="meanRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=1,
+ cluster_centers, segment_center_indices = representations(
+ segmentation_candidates,
+ cluster_order,
+ default="mean",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=1,
)
else:
# Original clustering logic
# produce adjacency matrix: Each time step is only connected to its preceding and succeeding one
- adjacencyMatrix = np.eye(timeStepsPerPeriod, k=1) + np.eye(
- timeStepsPerPeriod, k=-1
+ adjacency_matrix = np.eye(n_timesteps_per_period, k=1) + np.eye(
+ n_timesteps_per_period, k=-1
)
# execute clustering of adjacent time steps
- if noSegments == 1:
- clusterOrder = np.asarray([0] * len(segmentationCandidates))
+ if n_segments == 1:
+ cluster_order = np.asarray([0] * len(segmentation_candidates))
else:
clustering = AgglomerativeClustering(
- n_clusters=noSegments, linkage="ward", connectivity=adjacencyMatrix
+ n_clusters=n_segments, linkage="ward", connectivity=adjacency_matrix
)
- clusterOrder = clustering.fit_predict(segmentationCandidates)
+ cluster_order = clustering.fit_predict(segmentation_candidates)
# Relabel clusters to temporal order (0 = first segment, 1 = second, ...)
# so that stored assignments are deterministic when reapplied.
- _, first_indices = np.unique(clusterOrder, return_index=True)
+ _, first_indices = np.unique(cluster_order, return_index=True)
temporal_order = np.argsort(first_indices)
- label_map = np.empty(noSegments, dtype=int)
- label_map[temporal_order] = np.arange(noSegments)
- clusterOrder = label_map[clusterOrder]
+ label_map = np.empty(n_segments, dtype=int)
+ label_map[temporal_order] = np.arange(n_segments)
+ cluster_order = label_map[cluster_order]
# determine the indices where the segments change and the number of time steps in each segment
- segNo, indices, segmentNoOccur = np.unique(
- clusterOrder, return_index=True, return_counts=True
+ seg_no, indices, segment_no_occur = np.unique(
+ cluster_order, return_index=True, return_counts=True
)
- clusterOrderUnique = [clusterOrder[index] for index in sorted(indices)]
+ cluster_order_unique = [cluster_order[index] for index in sorted(indices)]
# determine the segments' values
- clusterCenters, segmentCenterIndices = representations(
- segmentationCandidates,
- clusterOrder,
- default="meanRepresentation",
- representationMethod=representationMethod,
- representationDict=representationDict,
- distributionPeriodWise=distributionPeriodWise,
- timeStepsPerPeriod=1,
+ cluster_centers, segment_center_indices = representations(
+ segmentation_candidates,
+ cluster_order,
+ default="mean",
+ representation_method=representation_method,
+ representation_dict=representation_dict,
+ distribution_period_wise=distribution_period_wise,
+ n_timesteps_per_period=1,
)
- # Reorder segment center indices to match temporal order (clusterOrderUnique)
- if segmentCenterIndices is not None:
- segmentCenterIndices = [
- segmentCenterIndices[c] for c in clusterOrderUnique
+ # Reorder segment center indices to match temporal order (cluster_order_unique)
+ if segment_center_indices is not None:
+ segment_center_indices = [
+ segment_center_indices[c] for c in cluster_order_unique
]
# predict each time step of the period by representing it with the corresponding segment's values
- predictedSegmentedNormalizedTypicalPeriods = (
- pd.DataFrame(clusterCenters, columns=normalizedTypicalPeriods.columns)
- .reindex(clusterOrder)
+ predicted_segmented = (
+ pd.DataFrame(cluster_centers, columns=normalized_typical_periods.columns)
+ .reindex(cluster_order)
.reset_index(drop=True)
)
# represent the period by the segments in the right order only instead of each time step
- segmentedNormalizedTypicalPeriods = (
- pd.DataFrame(clusterCenters, columns=normalizedTypicalPeriods.columns)
- .reindex(clusterOrderUnique)
+ segmented_typical = (
+ pd.DataFrame(cluster_centers, columns=normalized_typical_periods.columns)
+ .reindex(cluster_order_unique)
.set_index(np.sort(indices))
)
# keep additional information on the lengths of the segments in the right order
- segmentDuration = (
- pd.DataFrame(segmentNoOccur, columns=["Segment Duration"])
- .reindex(clusterOrderUnique)
+ segment_duration = (
+ pd.DataFrame(segment_no_occur, columns=["Segment Duration"])
+ .reindex(cluster_order_unique)
.set_index(np.sort(indices))
)
# create DataFrame with reduced number of segments together with three indices per period:
# 1. The segment number
# 2. The segment duration
# 3. The index of the original time step, at which the segment starts
- result = segmentedNormalizedTypicalPeriods.set_index(
+ result = segmented_typical.set_index(
[
- pd.Index(segNo, name="Segment Step"),
- segmentDuration["Segment Duration"],
+ pd.Index(seg_no, name="Segment Step"),
+ segment_duration["Segment Duration"],
pd.Index(np.sort(indices), name="Original Start Step"),
]
)
# append predicted and segmented DataFrame to list to create a big DataFrame for all periods
- predictedSegmentedNormalizedTypicalPeriodsList.append(
- predictedSegmentedNormalizedTypicalPeriods
- )
- segmentedNormalizedTypicalPeriodsList.append(result)
- segmentCenterIndicesList.append(segmentCenterIndices)
+ predicted_list.append(predicted_segmented)
+ segmented_list.append(result)
+ segment_center_indices_list.append(segment_center_indices)
# create a big DataFrame for all periods for predicted segmented time steps and segments and return
- predictedSegmentedNormalizedTypicalPeriods = pd.concat(
- predictedSegmentedNormalizedTypicalPeriodsList,
+ predicted_segmented = pd.concat(
+ predicted_list,
keys=period_indices,
).rename_axis(["", "TimeStep"])
- segmentedNormalizedTypicalPeriods = pd.concat(
- segmentedNormalizedTypicalPeriodsList,
+ segmented_typical = pd.concat(
+ segmented_list,
keys=period_indices,
)
return (
- segmentedNormalizedTypicalPeriods,
- predictedSegmentedNormalizedTypicalPeriods,
- segmentCenterIndicesList,
+ segmented_typical,
+ predicted_segmented,
+ segment_center_indices_list,
)
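
An illustrative call of the renamed segmentation API on toy data, reducing two 4-step typical periods to 2 segments each:

import pandas as pd

from tsam.utils.segmentation import segmentation

idx = pd.MultiIndex.from_product([[0, 1], range(4)], names=["", "TimeStep"])
typical = pd.DataFrame(
    {"Load": [1.0, 1.0, 5.0, 5.0, 2.0, 2.0, 2.0, 8.0]}, index=idx
)
segmented, predicted, centers = segmentation(
    typical, n_segments=2, n_timesteps_per_period=4
)
# `segmented` adds segment step, duration and original start step as index
# levels; `predicted` keeps the original shape with segment values written
# back per time step; `centers` lists the representative time step of each
# segment per period.
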
diff --git a/src/tsam/weights.py b/src/tsam/weights.py
new file mode 100644
index 00000000..adc2ed29
--- /dev/null
+++ b/src/tsam/weights.py
@@ -0,0 +1,70 @@
+"""Unified weight validation for tsam."""
+
+from __future__ import annotations
+
+import warnings
+from typing import TYPE_CHECKING
+
+from tsam.options import options
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+def __getattr__(name: str):
+ """Backward compat: ``MIN_WEIGHT`` now lives in ``tsam.options``."""
+ if name == "MIN_WEIGHT":
+ return options.min_weight
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def validate_weights(
+ columns: pd.Index,
+ weights: dict[str, float] | None,
+) -> dict[str, float] | None:
+ """Validate and normalize per-column clustering weights.
+
+ Consolidates:
+ - Column existence check (raises ValueError for unknown columns)
+ - min_weight clamping (warns and clamps near-zero weights)
+ - Returns None if all weights are effectively 1.0
+
+ Parameters
+ ----------
+ columns : pd.Index
+ Column names from the input data.
+ weights : dict or None
+ Per-column weights. Columns not listed default to 1.0.
+
+ Returns
+ -------
+ dict or None
+ Validated weights dict, or None if no weighting is needed
+ (all weights are 1.0 or input is None/empty).
+
+ Raises
+ ------
+ ValueError
+ If any weight key is not present in *columns*.
+ """
+ if not weights:
+ return None
+
+ missing = set(weights.keys()) - set(columns)
+ if missing:
+ raise ValueError(f"Weight columns not found in data: {missing}")
+
+ any_non_unit = False
+ cleaned: dict[str, float] = {}
+ for col, w in weights.items():
+ if w < options.min_weight:
+ warnings.warn(
+ f'weight of "{col}" set to the minimal tolerable weighting',
+ stacklevel=2,
+ )
+ w = options.min_weight
+ if w != 1.0:
+ any_non_unit = True
+ cleaned[col] = w
+
+ return cleaned if any_non_unit else None
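
Behavior sketch for the new helper, mirroring the docstring above (illustrative):

import pandas as pd

from tsam.weights import validate_weights

cols = pd.Index(["Load", "Wind"])
assert validate_weights(cols, None) is None              # nothing to validate
assert validate_weights(cols, {"Load": 1.0}) is None     # all-unit weights
assert validate_weights(cols, {"Load": 2.0}) == {"Load": 2.0}
# validate_weights(cols, {"Solar": 1.0}) raises ValueError (unknown column);
# validate_weights(cols, {"Load": 0.0}) warns and clamps to options.min_weight.
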
diff --git a/test/_old_new_equivalence.py b/test/_old_new_equivalence.py
index 2b7f0ec8..e380152a 100644
--- a/test/_old_new_equivalence.py
+++ b/test/_old_new_equivalence.py
@@ -448,6 +448,8 @@
"kmeans_weighted_segmentation": 1e-5,
}
+_SKIP_EQUIVALENCE: set[str] = set()
+
_WINDOWS_OPENMP_RUNTIME_WARNING_CASE_IDS = {
"kmeans_distribution/testdata",
"kmeans_segmentation/testdata",
@@ -475,6 +477,7 @@ class EquivalenceCase:
new_kwargs: dict
seed: int | None = None
rtol: float = 1e-10
+ skip_equivalence: bool = False
max_timesteps: int | None = None
@@ -501,6 +504,7 @@ def _build_cases() -> list[EquivalenceCase]:
new_kwargs=new_kw,
seed=base.seed,
rtol=_RTOL.get(base.id, 1e-10),
+ skip_equivalence=base.id in _SKIP_EQUIVALENCE,
max_timesteps=base.max_timesteps,
)
)
@@ -568,6 +572,8 @@ class TestOldNewEquivalence:
@pytest.mark.parametrize("case", CASES, ids=case_ids(CASES))
def test_cluster_representatives(self, case: EquivalenceCase):
"""Typical-period DataFrames must be equal."""
+ if case.skip_equivalence:
+ pytest.skip("intentional old/new divergence (weight decoupling)")
data = get_data(case.dataset)
with _suppress_windows_kmeans_warnings(case):
old_result, _ = _run_old(data, case)
@@ -595,6 +601,8 @@ def test_cluster_assignments(self, case: EquivalenceCase):
@pytest.mark.parametrize("case", CASES, ids=case_ids(CASES))
def test_accuracy(self, case: EquivalenceCase):
"""RMSE and MAE must match within tolerance."""
+ if case.skip_equivalence:
+ pytest.skip("intentional old/new divergence (weight decoupling)")
data = get_data(case.dataset)
with _suppress_windows_kmeans_warnings(case):
_, old_agg = _run_old(data, case)
@@ -616,6 +624,8 @@ def test_accuracy(self, case: EquivalenceCase):
@pytest.mark.parametrize("case", CASES, ids=case_ids(CASES))
def test_reconstruction(self, case: EquivalenceCase):
"""Reconstructed time series must match."""
+ if case.skip_equivalence:
+ pytest.skip("intentional old/new divergence (weight decoupling)")
data = get_data(case.dataset)
with _suppress_windows_kmeans_warnings(case):
_, old_agg = _run_old(data, case)
diff --git a/test/test_accuracyIndicators.py b/test/test_accuracyIndicators.py
index d5ddb129..db870c8e 100644
--- a/test/test_accuracyIndicators.py
+++ b/test/test_accuracyIndicators.py
@@ -6,50 +6,50 @@
def test_accuracyIndicators():
- hoursPerPeriod = 24
+ hours_per_period = 24
- noTypicalPeriods = 8
+ no_typical_periods = 8
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
aggregation1 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
)
aggregation2 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- sortValues=True,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ sort_values=True,
)
# make sure that the sum of the attribute specific RMSEs is smaller for the normal time series clustering than for
# the duration curve clustering
np.testing.assert_array_less(
- aggregation1.accuracyIndicators().loc[:, "RMSE"].sum(),
- aggregation2.accuracyIndicators().loc[:, "RMSE"].sum(),
+ aggregation1.accuracy_indicators().loc[:, "RMSE"].sum(),
+ aggregation2.accuracy_indicators().loc[:, "RMSE"].sum(),
)
# make sure that the sum of the attribute specific duration curve RMSEs is smaller for the duration curve
# clustering than for the normal time series clustering
np.testing.assert_array_less(
- aggregation2.accuracyIndicators().loc[:, "RMSE_duration"].sum(),
- aggregation1.accuracyIndicators().loc[:, "RMSE_duration"].sum(),
+ aggregation2.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
+ aggregation1.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
)
# make sure that the same accounts for the total accuracy indicator
np.testing.assert_array_less(
- aggregation1.totalAccuracyIndicators()["RMSE"],
- aggregation2.totalAccuracyIndicators()["RMSE"],
+ aggregation1.total_accuracy_indicators()["RMSE"],
+ aggregation2.total_accuracy_indicators()["RMSE"],
)
# make sure that the same accounts for the total accuracy indicator
np.testing.assert_array_less(
- aggregation2.totalAccuracyIndicators()["RMSE_duration"],
- aggregation1.totalAccuracyIndicators()["RMSE_duration"],
+ aggregation2.total_accuracy_indicators()["RMSE_duration"],
+ aggregation1.total_accuracy_indicators()["RMSE_duration"],
)
diff --git a/test/test_adjacent_periods.py b/test/test_adjacent_periods.py
index 1d7b9efd..fde5a8b7 100644
--- a/test/test_adjacent_periods.py
+++ b/test/test_adjacent_periods.py
@@ -10,26 +10,26 @@
def test_adjacent_periods():
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
- noTypicalPeriods = 8
+ no_typical_periods = 8
starttime = time.time()
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=24,
- clusterMethod="adjacent_periods",
- representationMethod="meanRepresentation",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=24,
+ cluster_method="adjacent_periods",
+ representation_method="meanRepresentation",
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
- # check whether the clusterOrder consists of noTypicalPeriods blocks of the same number
+ # check whether the cluster_order consists of no_typical_periods blocks of the same number
np.testing.assert_array_almost_equal(
- np.size(np.where(np.diff(aggregation.clusterOrder) != 0)),
- noTypicalPeriods - 1,
+ np.size(np.where(np.diff(aggregation.cluster_order) != 0)),
+ no_typical_periods - 1,
decimal=4,
)
diff --git a/test/test_aggregate_hiearchical.py b/test/test_aggregate_hiearchical.py
index 7fc2a79d..afeeb644 100644
--- a/test/test_aggregate_hiearchical.py
+++ b/test/test_aggregate_hiearchical.py
@@ -12,10 +12,10 @@ def test_aggregate_hiearchical():
header=[0, 1],
)
- _clusterCenters, _clusterCenterIndices, clusterOrder = tsam.aggregatePeriods(
+ _clusterCenters, _clusterCenterIndices, clusterOrder = tsam.aggregate_periods(
normalizedPeriodlyProfiles.values,
n_clusters=8,
- clusterMethod="hierarchical",
+ cluster_method="hierarchical",
)
orig = [
diff --git a/test/test_api_equivalence.py b/test/test_api_equivalence.py
index c78fa05a..c484c9df 100644
--- a/test/test_api_equivalence.py
+++ b/test/test_api_equivalence.py
@@ -47,11 +47,11 @@ def test_hierarchical_default(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -70,11 +70,11 @@ def test_hierarchical_default(self, sample_data):
# Compare cluster assignments
np.testing.assert_array_equal(
- old_agg.clusterOrder, new_result.cluster_assignments
+ old_agg.cluster_order, new_result.cluster_assignments
)
# Compare accuracy
- old_accuracy = old_agg.accuracyIndicators()
+ old_accuracy = old_agg.accuracy_indicators()
np.testing.assert_allclose(
old_accuracy["RMSE"].values,
new_result.accuracy.rmse.values,
@@ -92,11 +92,11 @@ def test_kmeans(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="k_means",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="k_means",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# Reset seed to get same random state for new API
np.random.seed(42)
@@ -115,7 +115,7 @@ def test_kmeans(self, sample_data):
check_names=False,
)
- old_accuracy = old_agg.accuracyIndicators()
+ old_accuracy = old_agg.accuracy_indicators()
np.testing.assert_allclose(
old_accuracy["RMSE"].values,
new_result.accuracy.rmse.values,
@@ -127,12 +127,12 @@ def test_hierarchical_with_medoid(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="medoidRepresentation",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="medoidRepresentation",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -149,18 +149,24 @@ def test_hierarchical_with_medoid(self, sample_data):
)
def test_with_weights(self, sample_data):
- """Test weighted clustering."""
+ """Test weighted clustering produces same cluster assignments.
+
+ The new API applies weights only for clustering distance (not baked
+ into normalized data), so medoid selection may differ from the old API.
+ Cluster assignments must still match since the distance metric is
+ equivalent for assignment purposes.
+ """
weights = {"Load": 2.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0}
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- weightDict=weights,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ weight_dict=weights,
)
- old_result = old_agg.createTypicalPeriods()
+ old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -171,10 +177,10 @@ def test_with_weights(self, sample_data):
weights=weights,
)
- pd.testing.assert_frame_equal(
- old_result,
- new_result.cluster_representatives,
- check_names=False,
+ # Cluster assignments must be identical (same weighted distance)
+ np.testing.assert_array_equal(
+ old_agg.cluster_order,
+ new_result.cluster_assignments,
)
def test_with_segmentation(self, sample_data):
@@ -182,13 +188,13 @@ def test_with_segmentation(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=12,
+ no_segments=12,
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -210,12 +216,12 @@ def test_with_duration_curves(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="durationRepresentation",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="durationRepresentation",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -236,13 +242,13 @@ def test_with_extremes_append(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- extremePeriodMethod="append",
- addPeakMax=["Load"],
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ extreme_period_method="append",
+ add_peak_max=["Load"],
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -264,11 +270,11 @@ def test_contiguous_clustering(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="adjacent_periods",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="adjacent_periods",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -289,12 +295,12 @@ def test_rescale_off(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -316,12 +322,12 @@ def test_distribution_minmax_representation(self, sample_data):
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="distributionAndMinMaxRepresentation",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="distributionAndMinMaxRepresentation",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API
new_result = aggregate(
@@ -547,7 +553,7 @@ def test_find_clusters_for_reduction(self):
]
for n_timesteps, n_segments, data_reduction in test_cases:
- old_result = old_tune.getNoPeriodsForDataReduction(
+ old_result = old_tune.get_no_periods_for_data_reduction(
n_timesteps, n_segments, data_reduction
)
new_result = find_clusters_for_reduction(
@@ -566,7 +572,7 @@ def test_find_segments_for_reduction(self):
]
for n_timesteps, n_clusters, data_reduction in test_cases:
- old_result = old_tune.getNoSegmentsForDataReduction(
+ old_result = old_tune.get_no_segments_for_data_reduction(
n_timesteps, n_clusters, data_reduction
)
new_result = find_segments_for_reduction(
@@ -575,7 +581,7 @@ def test_find_segments_for_reduction(self):
assert old_result == new_result
def test_find_optimal_combination(self, sample_data):
- """Test find_optimal_combination matches old identifyOptimalSegmentPeriodCombination."""
+ """Test find_optimal_combination matches old identify_optimal_segment_period_combination."""
data_reduction = 0.01
col = "Wind"
data = sample_data[[col]]
@@ -584,16 +590,16 @@ def test_find_optimal_combination(self, sample_data):
old_tuner = old_tune.HyperTunedAggregations(
old_tsam.TimeSeriesAggregation(
data,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="durationRepresentation",
- # Use defaults: distributionPeriodWise=True, rescaleClusterPeriods=True
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="durationRepresentation",
+ # Use defaults: distribution_period_wise=True, rescale_cluster_periods=True
segmentation=True,
)
)
old_segments, old_periods, old_rmse = (
- old_tuner.identifyOptimalSegmentPeriodCombination(
- dataReduction=data_reduction
+ old_tuner.identify_optimal_segment_period_combination(
+ data_reduction=data_reduction
)
)
@@ -620,16 +626,16 @@ def test_find_pareto_front(self, small_data):
old_tuner = old_tune.HyperTunedAggregations(
old_tsam.TimeSeriesAggregation(
small_data,
- hoursPerPeriod=12,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ hours_per_period=12,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
segmentation=True,
)
)
- old_tuner.identifyParetoOptimalAggregation()
- old_rmse_history = old_tuner._RMSEHistory
+ old_tuner.identify_pareto_optimal_aggregation()
+ old_rmse_history = old_tuner._rmse_history
# New API
new_results = find_pareto_front(
@@ -690,12 +696,12 @@ def test_15min_resolution(self):
# Old API with explicit resolution
old_agg = old_tsam.TimeSeriesAggregation(
data,
- noTypicalPeriods=4,
- hoursPerPeriod=24,
+ no_typical_periods=4,
+ hours_per_period=24,
resolution=0.25, # 15 minutes = 0.25 hours
- clusterMethod="hierarchical",
+ cluster_method="hierarchical",
)
- old_result = old_agg.createTypicalPeriods()
+ old_result = old_agg.create_typical_periods()
# New API (should infer resolution)
new_result = aggregate(
@@ -741,16 +747,16 @@ class TestReconstructionEquivalence:
"""Test that reconstruction produces identical results."""
def test_reconstruct_matches_old_predict(self, sample_data):
- """Test that reconstructed() matches predictOriginalData()."""
+ """Test that reconstructed() matches predict_original_data()."""
# Old API
old_agg = old_tsam.TimeSeriesAggregation(
sample_data,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
)
- old_agg.createTypicalPeriods()
- old_reconstructed = old_agg.predictOriginalData()
+ old_agg.create_typical_periods()
+ old_reconstructed = old_agg.predict_original_data()
# New API
new_result = aggregate(
diff --git a/test/test_assert_raises.py b/test/test_assert_raises.py
index 0ed58a63..14216386 100644
--- a/test/test_assert_raises.py
+++ b/test/test_assert_raises.py
@@ -2,7 +2,6 @@
import numpy as np
import pandas as pd
-import pytest
import tsam.timeseriesaggregation as tsam
from conftest import TESTDATA_CSV
@@ -16,40 +15,40 @@ def test_assert_raises():
# check error message for wrong time series
np.testing.assert_raises_regex(
ValueError,
- r"timeSeries has to be of type pandas.DataFrame\(\) or of type np.array\(\) in "
+ r"time_series has to be of type pandas.DataFrame\(\) or of type np.array\(\) in "
"initialization of object of class TimeSeriesAggregation",
tsam.TimeSeriesAggregation,
- timeSeries="erroneousTimeSeries",
+ time_series="erroneousTimeSeries",
)
# check error messages for wrong attribute names added for extreme period methods
np.testing.assert_raises_regex(
ValueError,
- 'erroneousAttribute listed in "addPeakMin" does not occur as timeSeries column',
+ 'erroneousAttribute listed in "add_peak_min" does not occur as time_series column',
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- addPeakMin=["erroneousAttribute"],
+ time_series=raw,
+ add_peak_min=["erroneousAttribute"],
)
np.testing.assert_raises_regex(
ValueError,
- 'erroneousAttribute listed in "addPeakMax" does not occur as timeSeries column',
+ 'erroneousAttribute listed in "add_peak_max" does not occur as time_series column',
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- addPeakMax=["erroneousAttribute"],
+ time_series=raw,
+ add_peak_max=["erroneousAttribute"],
)
np.testing.assert_raises_regex(
ValueError,
- 'erroneousAttribute listed in "addMeanMin" does not occur as timeSeries column',
+ 'erroneousAttribute listed in "add_mean_min" does not occur as time_series column',
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- addMeanMin=["erroneousAttribute"],
+ time_series=raw,
+ add_mean_min=["erroneousAttribute"],
)
np.testing.assert_raises_regex(
ValueError,
- 'erroneousAttribute listed in "addMeanMax" does not occur as timeSeries column',
+ 'erroneousAttribute listed in "add_mean_max" does not occur as time_series column',
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- addMeanMax=["erroneousAttribute"],
+ time_series=raw,
+ add_mean_max=["erroneousAttribute"],
)
# check error message for missing datetime index and missing resolution argument
@@ -58,7 +57,7 @@ def test_assert_raises():
"'resolution' argument has to be nonnegative float or int or the given "
"timeseries needs a datetime index",
tsam.TimeSeriesAggregation,
- timeSeries=raw.reset_index(),
+ time_series=raw.reset_index(),
)
# replace one of the datetime-like string indices in the raw data with an index that cannot be converted
rawErrInd = copy.deepcopy(raw)
@@ -72,158 +71,160 @@ def test_assert_raises():
"'resolution' argument has to be nonnegative float or int or the given "
"timeseries needs a datetime index",
tsam.TimeSeriesAggregation,
- timeSeries=rawErrInd,
+ time_series=rawErrInd,
)
# check erroneous resolution argument
np.testing.assert_raises_regex(
ValueError,
"resolution has to be nonnegative float or int",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
+ time_series=raw,
resolution="erroneousResolution",
)
- # check erroneous hoursPerPeriod argument
+ # check erroneous hours_per_period argument
np.testing.assert_raises_regex(
ValueError,
- "hoursPerPeriod has to be nonnegative float or int",
+ "hours_per_period has to be nonnegative float or int",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- hoursPerPeriod=None,
+ time_series=raw,
+ hours_per_period=None,
)
- # check erroneous noTypicalPeriods argument
+ # check erroneous no_typical_periods argument
np.testing.assert_raises_regex(
ValueError,
- "noTypicalPeriods has to be nonnegative integer",
+ "no_typical_periods has to be nonnegative integer",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- noTypicalPeriods=None,
+ time_series=raw,
+ no_typical_periods=None,
)
# check error for a non-integer number of time steps per typical period
np.testing.assert_raises_regex(
ValueError,
- "The combination of hoursPerPeriod and the resulution does not result in an integer "
+ "The combination of hours_per_period and the "
+ "resolution does not result in an integer "
"number of time steps per period",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- hoursPerPeriod=23,
+ time_series=raw,
+ hours_per_period=23,
resolution=2,
)
# check warning when number of segments per period is higher than the number of time steps per period
- with pytest.warns(Warning):
- tsam.TimeSeriesAggregation(
- timeSeries=raw,
- segmentation=True,
- noSegments=25,
- )
+ np.testing.assert_warns(
+ Warning,
+ tsam.TimeSeriesAggregation,
+ time_series=raw,
+ segmentation=True,
+ no_segments=25,
+ )
- # check erroneous clusterMethod argument
+ # check erroneous cluster_method argument
np.testing.assert_raises_regex(
ValueError,
- r"clusterMethod needs to be one of the following: \['averaging', 'k_means', "
+ r"cluster_method needs to be one of the following: \['averaging', 'k_means', "
r"'k_medoids', 'k_maxoids', 'hierarchical', 'adjacent_periods'\]",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- clusterMethod="erroneousClusterMethod",
+ time_series=raw,
+ cluster_method="erroneousClusterMethod",
)
- # check erroneous representationMethod argument
+ # check erroneous representation_method argument
np.testing.assert_raises(
ValueError,
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- representationMethod="erroneousRepresentationMethod",
+ time_series=raw,
+ representation_method="erroneousRepresentationMethod",
)
- # check erroneous extremePeriodMethod argument
+ # check erroneous extreme_period_method argument
np.testing.assert_raises_regex(
ValueError,
- r"extremePeriodMethod needs to be one of the following: \['None', 'append', "
+ r"extreme_period_method needs to be one of the following: \['None', 'append', "
r"'new_cluster_center', 'replace_cluster_center'\]",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- extremePeriodMethod="erroneousExtremePeriodMethod",
+ time_series=raw,
+ extreme_period_method="erroneousExtremePeriodMethod",
)
- # check erroneous evalSumPeriods argument
+ # check erroneous eval_sum_periods argument
np.testing.assert_raises_regex(
ValueError,
- "evalSumPeriods has to be boolean",
+ "eval_sum_periods has to be boolean",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- evalSumPeriods="erroneousEvalSumPeriods",
+ time_series=raw,
+ eval_sum_periods="erroneousEvalSumPeriods",
)
- # check erroneous sortValues argument
+ # check erroneous sort_values argument
np.testing.assert_raises_regex(
ValueError,
- "sortValues has to be boolean",
+ "sort_values has to be boolean",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- sortValues="erroneousSortValues",
+ time_series=raw,
+ sort_values="erroneousSortValues",
)
- # check erroneous sameMean argument
+ # check erroneous same_mean argument
np.testing.assert_raises_regex(
ValueError,
- "sameMean has to be boolean",
+ "same_mean has to be boolean",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- sameMean="erroneousSameMean",
+ time_series=raw,
+ same_mean="erroneousSameMean",
)
- # check erroneous rescaleClusterPeriods argument
+ # check erroneous rescale_cluster_periods argument
np.testing.assert_raises_regex(
ValueError,
- "rescaleClusterPeriods has to be boolean",
+ "rescale_cluster_periods has to be boolean",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- rescaleClusterPeriods="erroneousrescaleClusterPeriods",
+ time_series=raw,
+ rescale_cluster_periods="erroneousrescaleClusterPeriods",
)
- # check erroneous predefClusterOrder argument
+ # check erroneous predef_cluster_order argument
np.testing.assert_raises_regex(
ValueError,
- "predefClusterOrder has to be an array or list",
+ "predef_cluster_order has to be an array or list",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- predefClusterOrder="erroneousPredefClusterOrder",
+ time_series=raw,
+ predef_cluster_order="erroneousPredefClusterOrder",
)
# get a cluster order from a preceding clustering run
- aggregation = tsam.TimeSeriesAggregation(timeSeries=raw)
- periodOrder = aggregation.clusterOrder
- # check erroneous predefClusterCenterIndices argument
+ aggregation = tsam.TimeSeriesAggregation(time_series=raw)
+ periodOrder = aggregation.cluster_order
+ # check erroneous predef_cluster_center_indices argument
np.testing.assert_raises_regex(
ValueError,
- "predefClusterCenterIndices has to be an array or list",
+ "predef_cluster_center_indices has to be an array or list",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- predefClusterOrder=periodOrder,
- predefClusterCenterIndices="erroneousPredefClusterCenterIndices",
+ time_series=raw,
+ predef_cluster_order=periodOrder,
+ predef_cluster_center_indices="erroneousPredefClusterCenterIndices",
)
- # check error, when predefClusterCenterIndices are defined but not predefClusterOrder
+ # check error, when predef_cluster_center_indices are defined but not predef_cluster_order
np.testing.assert_raises_regex(
ValueError,
- 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be '
+ 'If "predef_cluster_center_indices" is defined, "predef_cluster_order" needs to be '
"defined as well",
tsam.TimeSeriesAggregation,
- timeSeries=raw,
- predefClusterCenterIndices="erroneousPredefClusterCenterIndices",
+ time_series=raw,
+ predef_cluster_center_indices="erroneousPredefClusterCenterIndices",
)
# check erroneous dataframe containing NaN values
rawNan = copy.deepcopy(raw)
rawNan.iloc[10, :] = np.nan
- aggregation = tsam.TimeSeriesAggregation(timeSeries=rawNan)
+ aggregation = tsam.TimeSeriesAggregation(time_series=rawNan)
np.testing.assert_raises_regex(
ValueError,
- "Pre processed data includes NaN. Please check the timeSeries input data.",
- aggregation.createTypicalPeriods,
+ "Pre processed data includes NaN. Please check the time_series input data.",
+ aggregation.create_typical_periods,
)
diff --git a/test/test_averaging.py b/test/test_averaging.py
index 0b5c899a..3900427b 100644
--- a/test/test_averaging.py
+++ b/test/test_averaging.py
@@ -10,42 +10,42 @@
def test_averaging():
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
- noTypicalPeriods = 8
+ no_typical_periods = 8
- hoursPerPeriod = 24
+ hours_per_period = 24
starttime = time.time()
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="averaging",
- representationMethod="meanRepresentation",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="averaging",
+ representation_method="meanRepresentation",
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
- # check whether the clusterOrder consists of noTypicalPeriods blocks of the same number
+ # check whether the cluster_order consists of no_typical_periods blocks of the same number
np.testing.assert_array_almost_equal(
- np.size(np.where(np.diff(aggregation.clusterOrder) != 0)),
- noTypicalPeriods - 1,
+ np.size(np.where(np.diff(aggregation.cluster_order) != 0)),
+ no_typical_periods - 1,
decimal=4,
)
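# e.g. a cluster_order of [0, 0, 1, 1, 2] has exactly two nonzero diffs,
# i.e. the number of contiguous blocks minus one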
# check whether the cluster centers are in line with the average of the candidates assigned to the different
# clusters
- for i in range(noTypicalPeriods):
+ for i in range(no_typical_periods):
calc = (
- tsam.unstackToPeriods(raw, hoursPerPeriod)[0]
- .loc[np.where(aggregation.clusterOrder == i)]
+ tsam.unstack_to_periods(raw, hours_per_period)[0]
+ .loc[np.where(aggregation.cluster_order == i)]
.mean(axis=0)
.to_frame()
.values
)
- orig = tsam.unstackToPeriods(typPeriods.loc[i], hoursPerPeriod)[0].T.values
+ orig = tsam.unstack_to_periods(typPeriods.loc[i], hours_per_period)[0].T.values
np.testing.assert_array_almost_equal(calc, orig, decimal=4)
diff --git a/test/test_backward_compat.py b/test/test_backward_compat.py
new file mode 100644
index 00000000..d9235c32
--- /dev/null
+++ b/test/test_backward_compat.py
@@ -0,0 +1,114 @@
+"""Tests that deprecated camelCase names still work for backward compatibility."""
+
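+# The deprecation shim exercised below presumably maps old kwargs onto the
+# new names along these lines (a sketch with illustrative names, not tsam's
+# actual implementation):
+#
+#     _RENAMES = {"noTypicalPeriods": "no_typical_periods",
+#                 "hoursPerPeriod": "hours_per_period"}
+#     for old, new in _RENAMES.items():
+#         if old in kwargs:
+#             warnings.warn(f"'{old}' is deprecated; use '{new}'",
+#                           FutureWarning, stacklevel=2)
+#             kwargs[new] = kwargs.pop(old)
+#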
+import warnings
+
+import pandas as pd
+import pytest
+
+from conftest import TESTDATA_CSV
+
+
+@pytest.fixture
+def raw():
+ return pd.read_csv(TESTDATA_CSV, index_col=0)
+
+
+class TestTimeSeriesAggregationCompat:
+ """Verify old camelCase kwargs, methods, and properties still work."""
+
+ def test_old_kwargs_accepted_with_warning(self, raw):
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+ agg = TimeSeriesAggregation(
+ timeSeries=raw,
+ noTypicalPeriods=8,
+ hoursPerPeriod=24,
+ clusterMethod="hierarchical",
+ )
+
+ future_warnings = [x for x in w if issubclass(x.category, FutureWarning)]
+ # Should have warnings for timeSeries, noTypicalPeriods, hoursPerPeriod, clusterMethod
+ assert len(future_warnings) >= 4
+ assert agg.no_typical_periods == 8
+ assert agg.hours_per_period == 24
+ assert agg.cluster_method == "hierarchical"
+
+ def test_unexpected_kwarg_raises(self, raw):
+ from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+ with pytest.raises(TypeError, match="Unexpected keyword arguments"):
+ TimeSeriesAggregation(raw, bogusKwarg=42)
+
+ def test_old_method_names(self, raw):
+ from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ agg = TimeSeriesAggregation(raw, noTypicalPeriods=8)
+
+ # Old method names should work
+ typical = agg.createTypicalPeriods()
+ assert typical is not None
+
+ predicted = agg.predictOriginalData()
+ assert predicted is not None
+
+ acc = agg.accuracyIndicators()
+ assert acc is not None
+
+ total_acc = agg.totalAccuracyIndicators()
+ assert total_acc is not None
+
+ matching = agg.indexMatching()
+ assert matching is not None
+
+ def test_old_property_names(self, raw):
+ from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ agg = TimeSeriesAggregation(raw, noTypicalPeriods=8)
+ agg.createTypicalPeriods()
+
+ # Old property names should return the same as new ones
+ assert agg.stepIdx == agg.step_idx
+ assert list(agg.clusterPeriodIdx) == list(agg.cluster_period_idx)
+ assert list(agg.clusterOrder) == list(agg.cluster_order)
+ assert agg.clusterPeriodNoOccur == agg.cluster_period_no_occur
+ assert agg.clusterPeriodDict == agg.cluster_period_dict
+
+
+class TestHyperparameterTuningCompat:
+ """Verify old camelCase function and method names still work."""
+
+ def test_old_function_aliases(self):
+ from tsam.hyperparametertuning import (
+ get_no_periods_for_data_reduction,
+ get_no_segments_for_data_reduction,
+ getNoPeriodsForDataReduction,
+ getNoSegmentsForDataReduction,
+ )
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ assert getNoPeriodsForDataReduction(
+ 8760, 24, 0.1
+ ) == get_no_periods_for_data_reduction(8760, 24, 0.1)
+ assert getNoSegmentsForDataReduction(
+ 8760, 10, 0.1
+ ) == get_no_segments_for_data_reduction(8760, 10, 0.1)
+
+ def test_save_aggregation_history_old_kwarg(self, raw):
+ from tsam.hyperparametertuning import HyperTunedAggregations
+ from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ base = TimeSeriesAggregation(raw, no_typical_periods=8, segmentation=True)
+ agg = HyperTunedAggregations(base, saveAggregationHistory=False)
+
+ future_warnings = [x for x in w if issubclass(x.category, FutureWarning)]
+ assert len(future_warnings) >= 1
+ assert agg.save_aggregation_history is False
diff --git a/test/test_cluster_order.py b/test/test_cluster_order.py
index 42e03c84..25925dd1 100644
--- a/test/test_cluster_order.py
+++ b/test/test_cluster_order.py
@@ -26,39 +26,39 @@ def test_cluster_order():
aggregation_wind = tsam.TimeSeriesAggregation(
raw_wind,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
)
- typPeriods_wind = aggregation_wind.createTypicalPeriods()
+ typPeriods_wind = aggregation_wind.create_typical_periods()
aggregation_predefClusterOrder = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
- predefClusterOrder=aggregation_wind.clusterOrder,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
+ predef_cluster_order=aggregation_wind.cluster_order,
)
typPeriods_predefClusterOrder = (
- aggregation_predefClusterOrder.createTypicalPeriods()
+ aggregation_predefClusterOrder.create_typical_periods()
)
aggregation_predefClusterOrderAndClusterCenters = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
- predefClusterOrder=aggregation_wind.clusterOrder,
- predefClusterCenterIndices=aggregation_wind.clusterCenterIndices,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
+ predef_cluster_order=aggregation_wind.cluster_order,
+ predef_cluster_center_indices=aggregation_wind.cluster_center_indices,
)
typPeriods_predefClusterOrderAndClusterCenters = (
- aggregation_predefClusterOrderAndClusterCenters.createTypicalPeriods()
+ aggregation_predefClusterOrderAndClusterCenters.create_typical_periods()
)
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_clustering_e2e.py b/test/test_clustering_e2e.py
index 958e0472..efe5da8c 100644
--- a/test/test_clustering_e2e.py
+++ b/test/test_clustering_e2e.py
@@ -279,7 +279,7 @@ def test_cluster_weights(
# Compare cluster weights (sum should match)
expected_weights = metadata["cluster_weights"]
- actual_weights = result.cluster_weights
+ actual_weights = result.cluster_counts
# Total weight should match number of original periods
expected_total = sum(expected_weights.values())
@@ -504,7 +504,7 @@ def generate_fixtures(output_dir: Path | None = None):
"n_clusters": 8,
},
"cluster_weights": {
- str(k): int(v) for k, v in result.cluster_weights.items()
+ str(k): int(v) for k, v in result.cluster_counts.items()
},
"accuracy": {
"rmse": {col: float(val) for col, val in result.accuracy.rmse.items()},
diff --git a/test/test_durationCurve.py b/test/test_durationCurve.py
index 9cb19d4b..265a32cb 100644
--- a/test/test_durationCurve.py
+++ b/test/test_durationCurve.py
@@ -12,33 +12,35 @@ def test_durationCurve():
# do everything for one attribute only to make sure that scaling does not play a role
raw = pd.read_csv(TESTDATA_CSV, index_col=0)["GHI"].to_frame()
- noTypicalPeriods = 8
+ no_typical_periods = 8
- hoursPerPeriod = 24
+ hours_per_period = 24
starttime = time.time()
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- sortValues=True,
- rescaleClusterPeriods=False,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ sort_values=True,
+ rescale_cluster_periods=False,
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
# sort every attribute in every period in descending order for both the found typical period and the days
# that belong to the corresponding cluster
- for i in range(noTypicalPeriods):
- calculated = tsam.unstackToPeriods(raw, hoursPerPeriod)[0].loc[
- np.where(aggregation.clusterOrder == i)[0], :
+ for i in range(no_typical_periods):
+ calculated = tsam.unstack_to_periods(raw, hours_per_period)[0].loc[
+ np.where(aggregation.cluster_order == i)[0], :
]
calculatedSorted = copy.deepcopy(calculated)
- algorithmResult = tsam.unstackToPeriods(typPeriods.loc[i], hoursPerPeriod)[0]
+ algorithmResult = tsam.unstack_to_periods(typPeriods.loc[i], hours_per_period)[
+ 0
+ ]
for j in raw.columns:
dfR = algorithmResult[j]
dfR[dfR.columns] = np.sort(dfR)[:, ::-1]
diff --git a/test/test_durationRepresentation.py b/test/test_durationRepresentation.py
index 2dfd49af..ca8f5a34 100644
--- a/test/test_durationRepresentation.py
+++ b/test/test_durationRepresentation.py
@@ -29,14 +29,14 @@ def test_durationRepresentation():
aggregation1 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- sortValues=False,
- clusterMethod="k_means",
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ sort_values=False,
+ cluster_method="k_means",
+ rescale_cluster_periods=False,
)
- predictedPeriods1 = aggregation1.predictOriginalData()
+ predictedPeriods1 = aggregation1.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
@@ -44,15 +44,15 @@ def test_durationRepresentation():
aggregation2 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- sortValues=False,
- clusterMethod="k_means",
- representationMethod="durationRepresentation",
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ sort_values=False,
+ cluster_method="k_means",
+ representation_method="durationRepresentation",
+ rescale_cluster_periods=False,
)
- predictedPeriods2 = aggregation2.predictOriginalData()
+ predictedPeriods2 = aggregation2.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
@@ -60,33 +60,33 @@ def test_durationRepresentation():
aggregation3 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- sortValues=False,
- clusterMethod="k_means",
- representationMethod="durationRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ sort_values=False,
+ cluster_method="k_means",
+ representation_method="durationRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
)
- predictedPeriods3 = aggregation3.predictOriginalData()
+ predictedPeriods3 = aggregation3.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
# make sure that the sum of the attribute specific RMSEs is smaller for the k-means clustering with centroid
# representation than for the duration curve representation
- np.testing.assert_array_less(
- aggregation1.accuracyIndicators().loc[:, "RMSE"].sum(),
- aggregation3.accuracyIndicators().loc[:, "RMSE"].sum(),
- aggregation2.accuracyIndicators().loc[:, "RMSE"].sum(),
- )
+ # assert_array_less compares only two arrays (a third positional
+ # argument is treated as err_msg), so chain the comparison
+ np.testing.assert_array_less(
+ aggregation1.accuracy_indicators().loc[:, "RMSE"].sum(),
+ aggregation3.accuracy_indicators().loc[:, "RMSE"].sum(),
+ )
+ np.testing.assert_array_less(
+ aggregation3.accuracy_indicators().loc[:, "RMSE"].sum(),
+ aggregation2.accuracy_indicators().loc[:, "RMSE"].sum(),
+ )
# make sure that the sum of the attribute specific duration curve RMSEs is smaller for the k-means clustering with
# duration curve representation than for the centroid representation
- np.testing.assert_array_less(
- aggregation3.accuracyIndicators().loc[:, "RMSE_duration"].sum(),
- aggregation2.accuracyIndicators().loc[:, "RMSE_duration"].sum(),
- aggregation1.accuracyIndicators().loc[:, "RMSE_duration"].sum(),
- )
+ # same two-array limitation as above, so chain the comparison
+ np.testing.assert_array_less(
+ aggregation3.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
+ aggregation2.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
+ )
+ np.testing.assert_array_less(
+ aggregation2.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
+ aggregation1.accuracy_indicators().loc[:, "RMSE_duration"].sum(),
+ )
@@ -96,18 +96,18 @@ def test_distributionMinMaxRepresentation():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=24,
+ no_typical_periods=24,
segmentation=True,
- noSegments=8,
- hoursPerPeriod=24,
- sortValues=False,
- clusterMethod="hierarchical",
- representationMethod="distributionAndMinMaxRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ no_segments=8,
+ hours_per_period=24,
+ sort_values=False,
+ cluster_method="hierarchical",
+ representation_method="distributionAndMinMaxRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
)
- predictedPeriods = aggregation.predictOriginalData()
+ predictedPeriods = aggregation.predict_original_data()
# make sure that max and min of the newly predicted time series are the same as
# from the original
@@ -128,18 +128,18 @@ def test_distributionRepresentation_keeps_mean():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
+ no_typical_periods=8,
+ hours_per_period=24,
segmentation=True,
- noSegments=8,
- sortValues=False,
- clusterMethod="hierarchical",
- representationMethod="distributionRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False, # even without rescaling
+ no_segments=8,
+ sort_values=False,
+ cluster_method="hierarchical",
+ representation_method="distributionRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False, # even without rescaling
)
- predictedPeriods = aggregation.predictOriginalData()
+ predictedPeriods = aggregation.predict_original_data()
assert np.isclose(raw.mean(), predictedPeriods.mean(), atol=1e-4).all()
diff --git a/test/test_extremePeriods.py b/test/test_extremePeriods.py
index 9032fae7..194c35ca 100644
--- a/test/test_extremePeriods.py
+++ b/test/test_extremePeriods.py
@@ -6,69 +6,69 @@
def test_extremePeriods():
- hoursPerPeriod = 24
+ hours_per_period = 24
- noTypicalPeriods = 8
+ no_typical_periods = 8
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
aggregation1 = tsam_legacy.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
- extremePeriodMethod="new_cluster_center",
- addPeakMax=["GHI"],
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
+ extreme_period_method="new_cluster_center",
+ add_peak_max=["GHI"],
)
aggregation2 = tsam_legacy.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
- extremePeriodMethod="append",
- addPeakMax=["GHI"],
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
+ extreme_period_method="append",
+ add_peak_max=["GHI"],
)
aggregation3 = tsam_legacy.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
- extremePeriodMethod="replace_cluster_center",
- addPeakMax=["GHI"],
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
+ extreme_period_method="replace_cluster_center",
+ add_peak_max=["GHI"],
)
# make sure that the RMSE for new cluster centers (reassigning points to the extreme point if the distance to it
# is smaller) is bigger than for appending just one extreme period
np.testing.assert_array_less(
- aggregation1.accuracyIndicators().loc["GHI", "RMSE"],
- aggregation2.accuracyIndicators().loc["GHI", "RMSE"],
+ aggregation1.accuracy_indicators().loc["GHI", "RMSE"],
+ aggregation2.accuracy_indicators().loc["GHI", "RMSE"],
)
# make sure that the RMSE for appending the extreme period is smaller than for replacing the cluster center by the
# extreme period (conservative assumption)
np.testing.assert_array_less(
- aggregation2.accuracyIndicators().loc["GHI", "RMSE"],
- aggregation3.accuracyIndicators().loc["GHI", "RMSE"],
+ aggregation2.accuracy_indicators().loc["GHI", "RMSE"],
+ aggregation3.accuracy_indicators().loc["GHI", "RMSE"],
)
- # check if addMeanMax and addMeanMin are working
+ # check if add_mean_max and add_mean_min are working
aggregation4 = tsam_legacy.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
- extremePeriodMethod="append",
- addMeanMax=["GHI"],
- addMeanMin=["GHI"],
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
+ extreme_period_method="append",
+ add_mean_max=["GHI"],
+ add_mean_min=["GHI"],
)
- origData = aggregation4.predictOriginalData()
+ origData = aggregation4.predict_original_data()
np.testing.assert_array_almost_equal(
raw.groupby(np.arange(len(raw)) // 24).mean().max().loc["GHI"],
diff --git a/test/test_golden_regression.py b/test/test_golden_regression.py
index 3dbb006d..6e78b2a7 100644
--- a/test/test_golden_regression.py
+++ b/test/test_golden_regression.py
@@ -97,14 +97,22 @@ class TestGoldenRegression:
@pytest.mark.parametrize("case", CASES, ids=case_ids(CASES))
def test_update_golden(self, case: EquivalenceCase, update_golden):
- """Save old-API reconstructed results as golden files (only with --update-golden)."""
+ """Save reconstructed results as golden files (only with --update-golden).
+
+ For configs with skip_equivalence (intentional old/new divergence),
+ golden is generated from the new API. Otherwise from the old API.
+ """
if not update_golden:
pytest.skip("use --update-golden to regenerate")
data = get_data(case.dataset, max_timesteps=case.max_timesteps)
- with _expected_warnings(case):
- _, old_agg = _run_old(data, case)
- _save_golden(old_agg.predictOriginalData(), case)
+ if case.skip_equivalence:
+ new_result = _run_new(data, case)
+ _save_golden(new_result.reconstructed, case)
+ else:
+ with _expected_warnings(case):
+ _, old_agg = _run_old(data, case)
+ _save_golden(old_agg.predictOriginalData(), case)
@pytest.mark.parametrize("case", CASES, ids=case_ids(CASES))
def test_new_api_matches_golden(self, case: EquivalenceCase, update_golden):
@@ -129,6 +137,7 @@ def test_new_api_matches_golden(self, case: EquivalenceCase, update_golden):
golden,
check_names=False,
check_freq=False,
+ check_like=True,
atol=1e-7,
)
@@ -137,6 +146,8 @@ def test_old_api_matches_golden(self, case: EquivalenceCase, update_golden):
"""Old API reconstructed result must match stored golden CSV."""
if update_golden:
pytest.skip("updating golden files")
+ if case.skip_equivalence:
+ pytest.skip("golden generated from new API (intentional divergence)")
path = _golden_path(case)
if not path.exists():
diff --git a/test/test_hierarchical.py b/test/test_hierarchical.py
index 2740c82a..a50678e8 100644
--- a/test/test_hierarchical.py
+++ b/test/test_hierarchical.py
@@ -19,15 +19,15 @@ def test_hierarchical():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- extremePeriodMethod="new_cluster_center",
- addPeakMin=["T"],
- addPeakMax=["Load"],
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ extreme_period_method="new_cluster_center",
+ add_peak_min=["T"],
+ add_peak_max=["Load"],
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
@@ -62,15 +62,15 @@ def test_hierarchical_for_weeks():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24 * 7,
- clusterMethod="hierarchical",
- extremePeriodMethod="new_cluster_center",
- addPeakMin=["T"],
- addPeakMax=["Load"],
+ no_typical_periods=8,
+ hours_per_period=24 * 7,
+ cluster_method="hierarchical",
+ extreme_period_method="new_cluster_center",
+ add_peak_min=["T"],
+ add_peak_max=["Load"],
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_hypertuneAggregation.py b/test/test_hypertuneAggregation.py
index 2de58d8d..1fe3addb 100644
--- a/test/test_hypertuneAggregation.py
+++ b/test/test_hypertuneAggregation.py
@@ -9,23 +9,23 @@
def test_getPeriodPair():
"""Tests if the number of periods is properly defined if a datareduction is set"""
- noRawTimeSteps = 100
- segmentsPerPeriod = 10
- dataReduction = 0.5
- noPeriods = tune.getNoPeriodsForDataReduction(
- noRawTimeSteps, segmentsPerPeriod, dataReduction
+ n_raw_timesteps = 100
+ segments_per_period = 10
+ data_reduction = 0.5
+ noPeriods = tune.get_no_periods_for_data_reduction(
+ n_raw_timesteps, segments_per_period, data_reduction
)
assert noPeriods == 5
- noRawTimeSteps = 101
- noPeriods = tune.getNoPeriodsForDataReduction(
- noRawTimeSteps, segmentsPerPeriod, dataReduction
+ n_raw_timesteps = 101
+ noPeriods = tune.get_no_periods_for_data_reduction(
+ n_raw_timesteps, segments_per_period, data_reduction
)
assert noPeriods == 5
- segmentsPerPeriod = 2
- noPeriods = tune.getNoPeriodsForDataReduction(
- noRawTimeSteps, segmentsPerPeriod, dataReduction
+ segments_per_period = 2
+ noPeriods = tune.get_no_periods_for_data_reduction(
+ n_raw_timesteps, segments_per_period, data_reduction
)
assert noPeriods == 25
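# These expectations are consistent with
# floor(n_raw_timesteps * data_reduction / segments_per_period),
# e.g. floor(101 * 0.5 / 2) == 25 (a pattern inferred from the cases
# above, not a documented formula)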
@@ -39,19 +39,19 @@ def test_optimalPair():
aggregation_wind = tune.HyperTunedAggregations(
tsam.TimeSeriesAggregation(
raw.loc[:, ["Wind"]],
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="durationRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="durationRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
segmentation=True,
)
)
# and identify the best combination for a data reduction to ~10%.
windSegments, windPeriods, _windRMSE = (
- aggregation_wind.identifyOptimalSegmentPeriodCombination(
- dataReduction=datareduction
+ aggregation_wind.identify_optimal_segment_period_combination(
+ data_reduction=datareduction
)
)
@@ -59,19 +59,19 @@ def test_optimalPair():
aggregation_solar = tune.HyperTunedAggregations(
tsam.TimeSeriesAggregation(
raw.loc[:, ["GHI"]],
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="durationRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="durationRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
segmentation=True,
)
)
# and identify the best combination for a data reduction to ~10%.
solarSegments, solarPeriods, _solarRMSE = (
- aggregation_solar.identifyOptimalSegmentPeriodCombination(
- dataReduction=datareduction
+ aggregation_solar.identify_optimal_segment_period_combination(
+ data_reduction=datareduction
)
)
@@ -101,40 +101,40 @@ def test_steepest_gradient_leads_to_optima():
tunedAggregations = tune.HyperTunedAggregations(
tsam.TimeSeriesAggregation(
raw,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
- rescaleClusterPeriods=False,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
+ rescale_cluster_periods=False,
segmentation=True,
)
)
# and identify the best combination for a data reduction.
_segmentsOpt, _periodsOpt, RMSEOpt = (
- tunedAggregations.identifyOptimalSegmentPeriodCombination(
- dataReduction=datareduction
+ tunedAggregations.identify_optimal_segment_period_combination(
+ data_reduction=datareduction
)
)
# test steepest
- tunedAggregations.identifyParetoOptimalAggregation(
- untilTotalTimeSteps=365 * SEGMENTS_TESTED
+ tunedAggregations.identify_pareto_optimal_aggregation(
+ until_total_timesteps=365 * SEGMENTS_TESTED
)
- steepestAggregation = tunedAggregations.aggregationHistory[-1]
- RMSEsteepest = steepestAggregation.totalAccuracyIndicators()["RMSE"]
+ steepestAggregation = tunedAggregations.aggregation_history[-1]
+ RMSEsteepest = steepestAggregation.total_accuracy_indicators()["RMSE"]
# only segments
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=365,
- hoursPerPeriod=24,
+ no_typical_periods=365,
+ hours_per_period=24,
segmentation=True,
- noSegments=SEGMENTS_TESTED,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
+ no_segments=SEGMENTS_TESTED,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
)
- RMSESegments = aggregation.totalAccuracyIndicators()["RMSE"]
+ RMSESegments = aggregation.total_accuracy_indicators()["RMSE"]
assert RMSEsteepest < RMSESegments
@@ -151,24 +151,24 @@ def test_paretoOptimalAggregation():
tunedAggregations = tune.HyperTunedAggregations(
tsam.TimeSeriesAggregation(
raw,
- hoursPerPeriod=12,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
- distributionPeriodWise=False,
- rescaleClusterPeriods=False,
+ hours_per_period=12,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
+ distribution_period_wise=False,
+ rescale_cluster_periods=False,
segmentation=True,
)
)
# determine pareto optimal aggregation
- tunedAggregations.identifyParetoOptimalAggregation()
+ tunedAggregations.identify_pareto_optimal_aggregation()
# test if last RMSE is 0
- assert tunedAggregations._RMSEHistory[-1] == 0
+ assert tunedAggregations._rmse_history[-1] == 0
# test if RMSE is monotonically decreasing
- for i, RMSE in enumerate(tunedAggregations._RMSEHistory[1:]):
- assert RMSE <= tunedAggregations._RMSEHistory[i]
+ for i, RMSE in enumerate(tunedAggregations._rmse_history[1:]):
+ assert RMSE <= tunedAggregations._rmse_history[i]
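+    # equivalent pairwise formulation of the loop above (sketch):
+    #   h = tunedAggregations._rmse_history
+    #   assert all(b <= a for a, b in zip(h, h[1:]))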
if __name__ == "__main__":
diff --git a/test/test_k_maxoids.py b/test/test_k_maxoids.py
index 573f570d..e27fa8e7 100644
--- a/test/test_k_maxoids.py
+++ b/test/test_k_maxoids.py
@@ -26,13 +26,13 @@ def test_k_maxoids():
aggregation1 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="k_means",
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="k_means",
+ rescale_cluster_periods=False,
)
- predictedPeriods1 = aggregation1.predictOriginalData()
+ predictedPeriods1 = aggregation1.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
@@ -40,13 +40,13 @@ def test_k_maxoids():
aggregation2 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="k_maxoids",
- rescaleClusterPeriods=False,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="k_maxoids",
+ rescale_cluster_periods=False,
)
- predictedPeriods2 = aggregation2.predictOriginalData()
+ predictedPeriods2 = aggregation2.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_k_medoids.py b/test/test_k_medoids.py
index 5a4e4aa5..781d8d72 100644
--- a/test/test_k_medoids.py
+++ b/test/test_k_medoids.py
@@ -24,7 +24,7 @@ def test_k_medoids():
clusterMethod="k_medoids",
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_minmaxRepresentation.py b/test/test_minmaxRepresentation.py
index 77b40288..93a18432 100644
--- a/test/test_minmaxRepresentation.py
+++ b/test/test_minmaxRepresentation.py
@@ -10,9 +10,9 @@
def test_minmaxRepresentation():
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
- noTypicalPeriods = 8
+ no_typical_periods = 8
- hoursPerPeriod = 24
+ hours_per_period = 24
representationDict = {"GHI": "max", "T": "min", "Wind": "mean", "Load": "min"}
@@ -22,38 +22,38 @@ def test_minmaxRepresentation():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- rescaleClusterPeriods=False,
- representationMethod="minmaxmeanRepresentation",
- representationDict=representationDict,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ rescale_cluster_periods=False,
+ representation_method="minmaxmeanRepresentation",
+ representation_dict=representationDict,
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
- for i in range(noTypicalPeriods):
+ for i in range(no_typical_periods):
for j in representationDict:
if representationDict[j] == "min":
calculated = (
- tsam.unstackToPeriods(raw, hoursPerPeriod)[0]
- .loc[np.where(aggregation.clusterOrder == i)[0], j]
+ tsam.unstack_to_periods(raw, hours_per_period)[0]
+ .loc[np.where(aggregation.cluster_order == i)[0], j]
.min()
.values
)
elif representationDict[j] == "max":
calculated = (
- tsam.unstackToPeriods(raw, hoursPerPeriod)[0]
- .loc[np.where(aggregation.clusterOrder == i)[0], j]
+ tsam.unstack_to_periods(raw, hours_per_period)[0]
+ .loc[np.where(aggregation.cluster_order == i)[0], j]
.max()
.values
)
elif representationDict[j] == "mean":
calculated = (
- tsam.unstackToPeriods(raw, hoursPerPeriod)[0]
- .loc[np.where(aggregation.clusterOrder == i)[0], j]
+ tsam.unstack_to_periods(raw, hours_per_period)[0]
+ .loc[np.where(aggregation.cluster_order == i)[0], j]
.mean()
.values
)
diff --git a/test/test_new_api.py b/test/test_new_api.py
index 69822526..16b6fea8 100644
--- a/test/test_new_api.py
+++ b/test/test_new_api.py
@@ -27,7 +27,7 @@ def test_basic_aggregation(self, sample_data):
assert result.cluster_representatives is not None
assert result.n_clusters == 8
- assert len(result.cluster_weights) == 8
+ assert len(result.cluster_counts) == 8
assert result.accuracy is not None
def test_with_cluster_config(self, sample_data):
diff --git a/test/test_preprocess.py b/test/test_preprocess.py
deleted file mode 100644
index 3202895b..00000000
--- a/test/test_preprocess.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import numpy as np
-import pandas as pd
-
-import tsam.timeseriesaggregation as tsam
-from conftest import RESULTS_DIR, TESTDATA_CSV
-
-
-def test_preprocess():
- raw = pd.read_csv(TESTDATA_CSV, index_col=0)
-
- raw_wind = raw.loc[:, "Wind"].to_frame()
-
- aggregation_wind = tsam.TimeSeriesAggregation(
- raw_wind, noTypicalPeriods=8, hoursPerPeriod=24, clusterMethod="hierarchical"
- )
-
- aggregation_wind._preProcessTimeSeries()
-
- test = aggregation_wind.normalizedPeriodlyProfiles
-
- orig = pd.read_csv(
- RESULTS_DIR / "preprocessed_wind.csv",
- index_col=[0],
- header=[0, 1],
- )
-
- np.testing.assert_array_almost_equal(test.values, orig.values, decimal=15)
-
-
-if __name__ == "__main__":
- test_preprocess()
diff --git a/test/test_properties.py b/test/test_properties.py
index 9bbbb953..5576f50a 100644
--- a/test/test_properties.py
+++ b/test/test_properties.py
@@ -12,11 +12,11 @@
@pytest.mark.filterwarnings("ignore:Segmentation is turned off:UserWarning")
def test_properties():
- hoursPerPeriod = 24
+ hours_per_period = 24
- noSegments = 8
+ no_segments = 8
- noTypicalPeriods = 8
+ no_typical_periods = 8
raw = pd.read_csv(TESTDATA_CSV, index_col=0)
@@ -24,69 +24,69 @@ def test_properties():
aggregation1 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=noSegments,
+ no_segments=no_segments,
)
print("Clustering took " + str(time.time() - starttime))
np.testing.assert_array_almost_equal(
- aggregation1.stepIdx, np.arange(noSegments), decimal=4
+ aggregation1.step_idx, np.arange(no_segments), decimal=4
)
starttime = time.time()
aggregation2 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
)
print("Clustering took " + str(time.time() - starttime))
np.testing.assert_array_almost_equal(
- aggregation2.stepIdx, np.arange(hoursPerPeriod), decimal=4
+ aggregation2.step_idx, np.arange(hours_per_period), decimal=4
)
starttime = time.time()
aggregation3 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
)
print("Clustering took " + str(time.time() - starttime))
np.testing.assert_array_almost_equal(
- aggregation3.clusterPeriodIdx, np.arange(noTypicalPeriods), decimal=4
+ aggregation3.cluster_period_idx, np.arange(no_typical_periods), decimal=4
)
starttime = time.time()
aggregation4 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=noSegments,
+ no_segments=no_segments,
)
print("Clustering took " + str(time.time() - starttime))
- appearances = np.unique(aggregation4.clusterOrder, return_counts=True)[1].tolist()
+ appearances = np.unique(aggregation4.cluster_order, return_counts=True)[1].tolist()
occurrenceDict = {i: j for i, j in enumerate(appearances)}
- # make sure that the clusterPeriodNoOccur equals the number of appearances in the clusterOrder
+ # make sure that the cluster_period_no_occur equals the number of appearances in the cluster_order
np.testing.assert_array_almost_equal(
- list(aggregation4.clusterPeriodNoOccur.values()),
+ list(aggregation4.cluster_period_no_occur.values()),
list(occurrenceDict.values()),
decimal=4,
)
@@ -95,19 +95,19 @@ def test_properties():
aggregation5 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=noSegments,
+ no_segments=no_segments,
)
print("Clustering took " + str(time.time() - starttime))
- # make sure that the values of the clusterPeriodDict equal those from the typicalPeriods-dataframe
+ # make sure that the values of the cluster_period_dict equal those from the typical periods DataFrame
np.testing.assert_array_almost_equal(
- pd.DataFrame.from_dict(data=aggregation5.clusterPeriodDict).values,
- aggregation5.createTypicalPeriods().values,
+ pd.DataFrame.from_dict(data=aggregation5.cluster_period_dict).values,
+ aggregation5.create_typical_periods().values,
decimal=4,
)
@@ -115,32 +115,32 @@ def test_properties():
aggregation6 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
)
print("Clustering took " + str(time.time() - starttime))
# make sure that the sum of all segment durations in each period equals the hours per period
- for i in range(noTypicalPeriods):
+ for i in range(no_typical_periods):
print(i)
- print(pd.DataFrame.from_dict(aggregation6.segmentDurationDict).loc[(i,), :])
+ print(pd.DataFrame.from_dict(aggregation6.segment_duration_dict).loc[(i,), :])
# print(
- # pd.DataFrame.from_dict(aggregation6.segmentDurationDict)
+ # pd.DataFrame.from_dict(aggregation6.segment_duration_dict)
# .iloc[
# pd.DataFrame.from_dict(
- # aggregation6.segmentDurationDict
+ # aggregation6.segment_duration_dict
# ).index.get_level_values(0)
# ]
# )
print("\n")
np.testing.assert_array_almost_equal(
- pd.DataFrame.from_dict(aggregation6.segmentDurationDict)
+ pd.DataFrame.from_dict(aggregation6.segment_duration_dict)
.loc[(i,), :]
.sum()
.iloc[0],
- hoursPerPeriod,
+ hours_per_period,
decimal=4,
)
print("")
@@ -149,23 +149,23 @@ def test_properties():
aggregation7 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=noSegments,
+ no_segments=no_segments,
)
print("Clustering took " + str(time.time() - starttime))
# make sure that the sum of all segment durations in each period equals the hours per period
- for i in range(noTypicalPeriods):
+ for i in range(no_typical_periods):
np.testing.assert_array_almost_equal(
- pd.DataFrame.from_dict(aggregation7.segmentDurationDict)
+ pd.DataFrame.from_dict(aggregation7.segment_duration_dict)
.loc[i]
.sum()
.iloc[0],
- hoursPerPeriod,
+ hours_per_period,
decimal=4,
)
@@ -173,33 +173,33 @@ def test_properties():
aggregation8 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=noSegments,
+ no_segments=no_segments,
)
print("Clustering took " + str(time.time() - starttime))
- indexTable = aggregation8.indexMatching()
+ indexTable = aggregation8.index_matching()
- # make sure that the PeriodNum column equals the clusterOrder
+ # make sure that the PeriodNum column equals the cluster_order
np.testing.assert_array_almost_equal(
- indexTable.loc[::24, "PeriodNum"].values, aggregation8.clusterOrder, decimal=4
+ indexTable.loc[::24, "PeriodNum"].values, aggregation8.cluster_order, decimal=4
)
- # make sure that the TimeStep indices equal the number of hoursPerPeriod arranged as array
+ # make sure that the TimeStep indices equal the number of hours_per_period arranged as array
np.testing.assert_array_almost_equal(
pd.unique(indexTable.loc[:, "TimeStep"]),
- np.arange(hoursPerPeriod, dtype="int64"),
+ np.arange(hours_per_period, dtype="int64"),
decimal=4,
)
- # make sure that the SegmentIndex indices equal the number of noSegments arranged as array
+ # make sure that the SegmentIndex indices equal the number of no_segments arranged as array
np.testing.assert_array_almost_equal(
pd.unique(indexTable.loc[:, "SegmentIndex"]),
- np.arange(noSegments, dtype="int64"),
+ np.arange(no_segments, dtype="int64"),
decimal=4,
)
diff --git a/test/test_reconstruct_samemean_segmentation.py b/test/test_reconstruct_samemean_segmentation.py
index 3e7df47a..568642ac 100644
--- a/test/test_reconstruct_samemean_segmentation.py
+++ b/test/test_reconstruct_samemean_segmentation.py
@@ -40,7 +40,7 @@ def _check_reconstruction_bounds(
self, result: AggregationResult, max_ratio: float = 1.5
):
"""Check that reconstructed values are within reasonable bounds."""
- original = result._aggregation.timeSeries
+ original = result.original
reconstructed = result.reconstructed
orig_max = original.max()
diff --git a/test/test_samemean.py b/test/test_samemean.py
index 46de88dd..ca27524d 100644
--- a/test/test_samemean.py
+++ b/test/test_samemean.py
@@ -27,23 +27,19 @@ def test_samemean():
os.environ["OMP_NUM_THREADS"] = "1"
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=8,
- hoursPerPeriod=24,
- clusterMethod="k_means",
- sameMean=True,
+ no_typical_periods=8,
+ hours_per_period=24,
+ cluster_method="k_means",
+ same_mean=True,
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
- # test if the normalized time series all have the same mean
- means = aggregation.normalizedTimeSeries.mean().values
- np.testing.assert_allclose(means, np.array([means[0]] * len(means)), rtol=1e-5)
-
# repredict the original data
- rearangedData = aggregation.predictOriginalData()
+ rearangedData = aggregation.predict_original_data()
- # test if the mean fits the mean of the raw time series --> should always hold for k-means independent from sameMean True or False
+ # test that the mean matches the mean of the raw time series --> should always hold for k-means, regardless of whether same_mean is True or False
np.testing.assert_array_almost_equal(
raw.mean(), rearangedData[raw.columns].mean(), decimal=4
)
diff --git a/test/test_segmentation.py b/test/test_segmentation.py
index 6baf5a0e..fe266822 100644
--- a/test/test_segmentation.py
+++ b/test/test_segmentation.py
@@ -22,15 +22,15 @@ def test_segmentation():
aggregation = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=20,
- hoursPerPeriod=24,
- clusterMethod="hierarchical",
- representationMethod="meanRepresentation",
+ no_typical_periods=20,
+ hours_per_period=24,
+ cluster_method="hierarchical",
+ representation_method="meanRepresentation",
segmentation=True,
- noSegments=12,
+ no_segments=12,
)
- typPeriods = aggregation.createTypicalPeriods()
+ typPeriods = aggregation.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
@@ -59,7 +59,7 @@ def test_segmentation():
def test_representation_in_segmentation():
- segmentationCandidates = np.array(
+ segmentation_candidates = np.array(
[
[0.0, 0.38936961, 0.27539063, 0.25],
[0.0, 0.35591778, 0.26841518, 0.25],
@@ -88,29 +88,29 @@ def test_representation_in_segmentation():
]
)
- clusterOrder = np.array(
+ cluster_order = np.array(
[5, 5, 5, 5, 5, 7, 3, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 6, 6, 4, 4]
)
- clusterCenters_mean, _clusterCenterIndices = tsam.representations(
- segmentationCandidates,
- clusterOrder,
+ cluster_centers_mean, _cluster_center_indices = tsam.representations(
+ segmentation_candidates,
+ cluster_order,
default="meanRepresentation",
- representationMethod="meanRepresentation",
- distributionPeriodWise=False,
- timeStepsPerPeriod=1,
+ representation_method="meanRepresentation",
+ distribution_period_wise=False,
+ n_timesteps_per_period=1,
)
- clusterCenters_dist, _clusterCenterIndices = tsam.representations(
- segmentationCandidates,
- clusterOrder,
+ cluster_centers_dist, _cluster_center_indices = tsam.representations(
+ segmentation_candidates,
+ cluster_order,
default="meanRepresentation",
- representationMethod="distributionRepresentation",
- distributionPeriodWise=True,
- timeStepsPerPeriod=1,
+ representation_method="distributionRepresentation",
+ distribution_period_wise=True,
+ n_timesteps_per_period=1,
)
- assert np.isclose(clusterCenters_mean, clusterCenters_dist).all()
+ assert np.isclose(cluster_centers_mean, cluster_centers_dist).all()
if __name__ == "__main__":
diff --git a/test/test_subhourlyResolution.py b/test/test_subhourlyResolution.py
index 0145c633..46e6730d 100644
--- a/test/test_subhourlyResolution.py
+++ b/test/test_subhourlyResolution.py
@@ -24,10 +24,10 @@ def test_subhourlyResolution():
starttime = time.time()
aggregation1 = tsam.TimeSeriesAggregation(
- raw, noTypicalPeriods=8, hoursPerPeriod=24, clusterMethod="hierarchical"
+ raw, no_typical_periods=8, hours_per_period=24, cluster_method="hierarchical"
)
- typPeriods1 = aggregation1.createTypicalPeriods()
+ typPeriods1 = aggregation1.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
@@ -36,12 +36,12 @@ def test_subhourlyResolution():
# cluster dataframe with 15 min. intervals to six hours per period, which equals 24 time steps per period
aggregation2 = tsam.TimeSeriesAggregation(
rawSubhourlyInndex,
- noTypicalPeriods=8,
- hoursPerPeriod=6,
- clusterMethod="hierarchical",
+ no_typical_periods=8,
+ hours_per_period=6,
+ cluster_method="hierarchical",
)
- typPeriods2 = aggregation2.createTypicalPeriods()
+ typPeriods2 = aggregation2.create_typical_periods()
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_subhourly_periods.py b/test/test_subhourly_periods.py
index eda65f32..13d570e3 100644
--- a/test/test_subhourly_periods.py
+++ b/test/test_subhourly_periods.py
@@ -21,14 +21,14 @@ def test_subhourly_periods():
aggregation = tsam.TimeSeriesAggregation(
testData,
- noTypicalPeriods=8,
- hoursPerPeriod=0.25,
- clusterMethod="hierarchical",
+ no_typical_periods=8,
+ hours_per_period=0.25,
+ cluster_method="hierarchical",
segmentation=True,
- noSegments=1,
+ no_segments=1,
)
- results = aggregation.predictOriginalData()
+ results = aggregation.predict_original_data()
print("Clustering took " + str(time.time() - starttime))
diff --git a/test/test_weight_decoupling.py b/test/test_weight_decoupling.py
new file mode 100644
index 00000000..116abd16
--- /dev/null
+++ b/test/test_weight_decoupling.py
@@ -0,0 +1,257 @@
+"""Tests for the weight decoupling refactor.
+
+Verifies that per-column weights affect only clustering distance and
+are correctly handled for partial weight dicts, duration curves,
+serialization round-trips, and the deprecated property.
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from conftest import TESTDATA_CSV
+from tsam import ClusterConfig, ClusteringResult, aggregate
+from tsam.pipeline import _build_weight_vector
+from tsam.weights import MIN_WEIGHT, validate_weights
+
+
+@pytest.fixture
+def sample_data():
+ return pd.read_csv(TESTDATA_CSV, index_col=0, parse_dates=True)
+
+
+# ---------------------------------------------------------------------------
+# Unit tests for helpers
+# ---------------------------------------------------------------------------
+
+
+class TestBuildWeightVector:
+ """Unit tests for _build_weight_vector."""
+
+ def test_none_weights_returns_none(self):
+ cols = pd.Index(["A", "B", "C"])
+ assert _build_weight_vector(cols, None) is None
+
+ def test_empty_weights_returns_none(self):
+ cols = pd.Index(["A", "B", "C"])
+ assert _build_weight_vector(cols, {}) is None
+
+ def test_all_unit_weights_returns_none(self):
+ cols = pd.Index(["A", "B", "C"])
+ assert _build_weight_vector(cols, {"A": 1.0, "B": 1.0, "C": 1.0}) is None
+
+ def test_full_weights(self):
+ cols = pd.Index(["A", "B", "C"])
+ result = _build_weight_vector(cols, {"A": 2.0, "B": 1.0, "C": 3.0})
+ np.testing.assert_array_equal(result, [2.0, 1.0, 3.0])
+
+ def test_partial_weights_default_to_one(self):
+ """Unlisted columns must default to 1.0, not be omitted."""
+ cols = pd.Index(["A", "B", "C"])
+ result = _build_weight_vector(cols, {"B": 2.0})
+ np.testing.assert_array_equal(result, [1.0, 2.0, 1.0])
+
+ def test_min_weight_enforcement(self):
+ cols = pd.Index(["A", "B"])
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ result = _build_weight_vector(cols, {"A": 0.0})
+ assert len(w) == 1
+ assert "minimal tolerable" in str(w[0].message)
+ assert result is not None
+        assert result[0] == pytest.approx(MIN_WEIGHT)
+ assert result[1] == 1.0
+
+ def test_preserves_column_order(self):
+ """Weights must follow the column order, not the dict order."""
+ cols = pd.Index(["C", "A", "B"])
+ result = _build_weight_vector(cols, {"A": 2.0, "B": 3.0, "C": 4.0})
+ np.testing.assert_array_equal(result, [4.0, 2.0, 3.0])
+
+
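+# For orientation, a minimal reference sketch of the contract the class above
+# pins down. This is an illustration only, NOT tsam's actual implementation.
+def _build_weight_vector_sketch(columns: pd.Index, weights: dict | None):
+    """Trivial dicts collapse to None, unlisted columns default to 1.0,
+    sub-minimal weights are clamped with a warning, column order is kept."""
+    if not weights or all(w == 1.0 for w in weights.values()):
+        return None
+    vec = np.array([float(weights.get(col, 1.0)) for col in columns])
+    if (vec < MIN_WEIGHT).any():
+        warnings.warn("weights below the minimal tolerable value are clamped")
+        vec = vec.clip(min=MIN_WEIGHT)
+    return vec
+
+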
+# ---------------------------------------------------------------------------
+# Integration tests
+# ---------------------------------------------------------------------------
+
+
+class TestPartialWeights:
+ """Partial weight dicts must work correctly."""
+
+ def test_partial_weight_only_affects_specified_column(self, sample_data):
+ """Weighting only Load should change clustering vs unweighted."""
+ unweighted = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical"),
+ )
+ partial = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical", weights={"Load": 10.0}),
+ )
+        # With a very high weight on Load, Load's RMSE should not get worse
+ assert partial.accuracy.rmse["Load"] <= unweighted.accuracy.rmse["Load"]
+
+ def test_partial_weight_equals_full_with_defaults(self, sample_data):
+ """weights={"Load": 2.0} must equal {"Load": 2.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0}."""
+ partial = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical", weights={"Load": 2.0}),
+ )
+ full = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(
+ method="hierarchical",
+ weights={"Load": 2.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0},
+ ),
+ )
+ np.testing.assert_array_equal(
+ partial.cluster_assignments, full.cluster_assignments
+ )
+ pd.testing.assert_frame_equal(
+ partial.cluster_representatives, full.cluster_representatives
+ )
+
+
+class TestDurationCurvesWithWeights:
+ """Weights must actually affect clustering when use_duration_curves=True."""
+
+ def test_weighted_duration_curves_differ_from_unweighted(self, sample_data):
+ unweighted = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical", use_duration_curves=True),
+ )
+ weighted = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(
+ method="hierarchical",
+ use_duration_curves=True,
+ weights={"Load": 10.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0},
+ ),
+ )
+ # With a very strong weight on Load, assignments should change
+ # (or at least Load RMSE should improve)
+ assert weighted.accuracy.rmse["Load"] <= unweighted.accuracy.rmse["Load"]
+
+
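+# Hedged sketch of what use_duration_curves changes (an illustration; the
+# exact shapes and names inside tsam are assumptions): each period profile is
+# sorted descending, so clustering compares duration curves rather than
+# chronological profiles, while the column weights still scale the distance.
+def _duration_curve_sketch(period_profiles: np.ndarray) -> np.ndarray:
+    """Sort each row (one period) descending to obtain its duration curve."""
+    return -np.sort(-period_profiles, axis=1)
+
+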
+class TestDeprecatedClusterWeightsProperty:
+ """The deprecated cluster_weights property must emit FutureWarning."""
+
+ def test_emits_future_warning(self, sample_data):
+ result = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical"),
+ )
+ with pytest.warns(FutureWarning, match="cluster_weights.*deprecated"):
+ _ = result.cluster_weights
+
+
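+# The deprecation shim this test expects, sketched as commented code (a
+# hypothetical pattern, not the real result module): a property that warns
+# and then delegates to its renamed successor.
+#
+#     @property
+#     def cluster_weights(self):
+#         warnings.warn(
+#             "cluster_weights is deprecated; use the renamed attribute",
+#             FutureWarning,
+#             stacklevel=2,
+#         )
+#         return ...  # delegate to the successor attribute
+
+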
+class TestWeightRoundTrip:
+ """ClusteringResult.apply() must preserve weights through serialization."""
+
+ def test_apply_with_weights(self, sample_data):
+ weights = {"Load": 2.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0}
+ result1 = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical", weights=weights),
+ )
+
+ # Apply stored clustering to same data
+ result2 = result1.clustering.apply(sample_data)
+
+ np.testing.assert_array_equal(
+ result1.cluster_assignments, result2.cluster_assignments
+ )
+
+ def test_json_roundtrip_preserves_weights(self, sample_data, tmp_path):
+ weights = {"Load": 2.0, "GHI": 1.0, "T": 1.0, "Wind": 1.0}
+ result1 = aggregate(
+ sample_data,
+ n_clusters=8,
+ period_duration=24,
+ cluster=ClusterConfig(method="hierarchical", weights=weights),
+ )
+
+ # Serialize and deserialize
+ json_path = tmp_path / "clustering.json"
+ result1.clustering.to_json(str(json_path))
+ loaded = ClusteringResult.from_json(str(json_path))
+
+ # Verify weights survived
+ assert loaded.weights is not None
+ restored = dict(loaded.weights)
+ assert restored == weights
+
+ # Apply and verify identical assignments
+ result2 = loaded.apply(sample_data)
+ np.testing.assert_array_equal(
+ result1.cluster_assignments, result2.cluster_assignments
+ )
+
+
+# ---------------------------------------------------------------------------
+# validate_weights tests
+# ---------------------------------------------------------------------------
+
+
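+# Like _build_weight_vector_sketch above, a reference sketch of the contract
+# exercised below, here for the unified dict-based validator (an illustration,
+# not the real tsam.weights code): unknown columns raise, trivial dicts
+# collapse to None, and sub-minimal weights are clamped with a warning.
+def _validate_weights_sketch(columns: pd.Index, weights: dict | None):
+    if not weights:
+        return None
+    unknown = set(weights) - set(columns)
+    if unknown:
+        raise ValueError(f"Weight columns not found: {sorted(unknown)}")
+    if all(w == 1.0 for w in weights.values()):
+        return None
+    out = {}
+    for col, w in weights.items():
+        if w < MIN_WEIGHT:
+            warnings.warn("clamped to the minimal tolerable weight")
+            w = MIN_WEIGHT
+        out[col] = float(w)
+    return out
+
+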
+class TestValidateWeights:
+ """Unit tests for the unified validate_weights() function."""
+
+ def test_none_returns_none(self):
+ assert validate_weights(pd.Index(["A", "B"]), None) is None
+
+ def test_empty_returns_none(self):
+ assert validate_weights(pd.Index(["A", "B"]), {}) is None
+
+ def test_all_unit_returns_none(self):
+ assert validate_weights(pd.Index(["A", "B"]), {"A": 1.0, "B": 1.0}) is None
+
+ def test_valid_weights_returned(self):
+ result = validate_weights(pd.Index(["A", "B"]), {"A": 2.0, "B": 1.0})
+ assert result == {"A": 2.0, "B": 1.0}
+
+ def test_missing_column_raises(self):
+ with pytest.raises(ValueError, match="Weight columns not found"):
+ validate_weights(pd.Index(["A", "B"]), {"A": 1.0, "Z": 2.0})
+
+ def test_min_weight_clamping(self):
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ result = validate_weights(pd.Index(["A", "B"]), {"A": 0.0, "B": 1.0})
+ assert len(w) == 1
+ assert "minimal tolerable" in str(w[0].message)
+ assert result is not None
+ assert result["A"] == pytest.approx(MIN_WEIGHT)
+
+ def test_old_wrapper_rejects_invalid_columns(self, sample_data):
+ """Old wrapper now raises ValueError on invalid weight column names."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # legacy wrapper warns on import and use
+            from tsam.timeseriesaggregation import TimeSeriesAggregation
+
+            agg = TimeSeriesAggregation(
+                time_series=sample_data,
+                no_typical_periods=8,
+                hours_per_period=24,
+                weight_dict={"NonExistent": 2.0},
+            )
+ with pytest.raises(ValueError, match="Weight columns not found"):
+ agg.create_typical_periods()
diff --git a/test/test_weight_handling.py b/test/test_weight_handling.py
new file mode 100644
index 00000000..2c6788a5
--- /dev/null
+++ b/test/test_weight_handling.py
@@ -0,0 +1,539 @@
+"""Thorough tests for weight handling in tsam.
+
+Tests that weights affect ONLY clustering distance, and do not leak into:
+- output scale (typical_periods, predicted_data)
+- rescaling behavior
+- accuracy indicators
+- reconstructed column means
+"""
+
+import pandas as pd
+import pytest
+
+import tsam.timeseriesaggregation as tsam
+from conftest import TESTDATA_CSV
+
+pytestmark = pytest.mark.filterwarnings("ignore::tsam.exceptions.LegacyAPIWarning")
+
+RAW = pd.read_csv(TESTDATA_CSV, index_col=0)
+N_TYPICAL = 8
+HOURS_PER_PERIOD = 24
+
+
+def _make_agg(weight_dict=None, **kwargs):
+ defaults = {
+ "no_typical_periods": N_TYPICAL,
+ "hours_per_period": HOURS_PER_PERIOD,
+ "cluster_method": "hierarchical",
+ }
+ defaults.update(kwargs)
+ if weight_dict is not None:
+ defaults["weight_dict"] = weight_dict
+ agg = tsam.TimeSeriesAggregation(RAW.copy(), **defaults)
+ agg.create_typical_periods()
+ return agg
+
+
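+# The single invariant behind every test in this module, sketched as a tiny
+# helper (an illustration; tsam's internal pipeline differs): weights scale
+# the matrix handed to the clustering distance and are divided back out
+# before any output is built, so outputs stay in the original data scale.
+def _weight_roundtrip_sketch(df: pd.DataFrame, weight_dict: dict) -> pd.DataFrame:
+    w = pd.Series({col: float(weight_dict.get(col, 1.0)) for col in df.columns})
+    weighted = df * w  # what the clustering distance sees
+    return weighted / w  # what every output path must undo exactly
+
+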
+# ---------------------------------------------------------------------------
+# 1. Output range: typical_periods must stay within original data bounds
+# ---------------------------------------------------------------------------
+
+
+class TestOutputRange:
+ """typical_periods and predicted_data must not exceed original data bounds
+ regardless of weight configuration."""
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 1, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ {"GHI": 100, "T": 100, "Wind": 100, "Load": 100},
+ ],
+ )
+ def test_typical_periods_within_bounds(self, weights):
+ agg = _make_agg(weights)
+ tp = agg.typical_periods
+
+ for col in RAW.columns:
+ col_min = RAW[col].min()
+ col_max = RAW[col].max()
+ assert tp[col].min() >= col_min - 1e-6, (
+ f"{col}: typical min {tp[col].min()} < data min {col_min}"
+ )
+ assert tp[col].max() <= col_max + 1e-6, (
+ f"{col}: typical max {tp[col].max()} > data max {col_max}"
+ )
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_predicted_data_within_bounds(self, weights):
+ agg = _make_agg(weights)
+ pred = agg.predict_original_data()
+
+ for col in RAW.columns:
+ col_min = RAW[col].min()
+ col_max = RAW[col].max()
+ assert pred[col].min() >= col_min - 1e-6, (
+ f"{col}: pred min {pred[col].min()} < data min {col_min}"
+ )
+ assert pred[col].max() <= col_max + 1e-6, (
+ f"{col}: pred max {pred[col].max()} > data max {col_max}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# 2. Uniform weights == no weights (identical cluster assignments → identical output)
+# ---------------------------------------------------------------------------
+
+
+class TestUniformWeightsEquivalence:
+ """Uniform weights (all same value) should produce identical results to no weights."""
+
+ @pytest.mark.parametrize("uniform_weight", [1, 2, 0.5, 10, 100])
+ def test_uniform_weights_typical_periods(self, uniform_weight):
+ agg_none = _make_agg(None)
+ agg_uniform = _make_agg(dict.fromkeys(RAW.columns, uniform_weight))
+
+ pd.testing.assert_frame_equal(
+ agg_none.typical_periods,
+ agg_uniform.typical_periods,
+ atol=1e-6,
+ )
+
+ @pytest.mark.parametrize("uniform_weight", [1, 2, 10])
+ def test_uniform_weights_predicted_data(self, uniform_weight):
+ agg_none = _make_agg(None)
+ agg_uniform = _make_agg(dict.fromkeys(RAW.columns, uniform_weight))
+
+ pd.testing.assert_frame_equal(
+ agg_none.predict_original_data(),
+ agg_uniform.predict_original_data(),
+ atol=1e-6,
+ )
+
+ @pytest.mark.parametrize("uniform_weight", [1, 2, 10])
+ def test_uniform_weights_accuracy(self, uniform_weight):
+ agg_none = _make_agg(None)
+ agg_uniform = _make_agg(dict.fromkeys(RAW.columns, uniform_weight))
+
+ pd.testing.assert_frame_equal(
+ agg_none.accuracy_indicators(),
+ agg_uniform.accuracy_indicators(),
+ atol=1e-6,
+ )
+
+
+# ---------------------------------------------------------------------------
+# 3. Rescaling: column means should be preserved
+# ---------------------------------------------------------------------------
+
+
+class TestRescalePreservesMeans:
+ """After rescaling, reconstructed data should preserve the original
+ column means (within tolerance), regardless of weights."""
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 1, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_predicted_data_preserves_column_means(self, weights):
+ agg = _make_agg(weights, rescale_cluster_periods=True)
+ pred = agg.predict_original_data()
+
+ for col in RAW.columns:
+ orig_mean = RAW[col].mean()
+ pred_mean = pred[col].mean()
+ if orig_mean == 0:
+ assert abs(pred_mean) < 1e-6
+ else:
+ rel_error = abs(pred_mean - orig_mean) / abs(orig_mean)
+ assert rel_error < 0.02, (
+ f"{col}: mean relative error {rel_error:.4f} exceeds 2% "
+ f"(orig={orig_mean:.4f}, pred={pred_mean:.4f})"
+ )
+
+
+# ---------------------------------------------------------------------------
+# 4. Weights affect clustering but not scale
+# ---------------------------------------------------------------------------
+
+
+class TestWeightsAffectOnlyClustering:
+ """Non-uniform weights should change cluster assignments but the
+ output (typical_periods) should still be in the original data scale."""
+
+ def test_non_uniform_weights_change_assignments(self):
+ """Different weights should (in general) give different cluster orders."""
+ agg1 = _make_agg({"GHI": 1, "T": 1, "Wind": 1, "Load": 1})
+ agg3 = _make_agg({"GHI": 10, "T": 1, "Wind": 1, "Load": 1})
+
+ order1 = list(agg1._cluster_order)
+ order3 = list(agg3._cluster_order)
+ assert order1 != order3, (
+ "Expected different cluster orders with extreme weight diff"
+ )
+
+ def test_weight_does_not_scale_output(self):
+ """Even with extreme weights, output values should be in original data range."""
+ agg = _make_agg({"GHI": 100, "T": 0.01, "Wind": 1, "Load": 1})
+ tp = agg.typical_periods
+
+ # GHI should NOT be 100x its original range
+ assert tp["GHI"].max() <= RAW["GHI"].max() + 1e-6
+ # T should NOT be 0.01x its original range
+ assert tp["T"].min() >= RAW["T"].min() - 1e-6
+
+
+# ---------------------------------------------------------------------------
+# 5. Weight interaction with same_mean
+# ---------------------------------------------------------------------------
+
+
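+# same_mean additionally rescales each normalized column to a shared mean
+# before clustering (sketch below is an illustration, not tsam's code); like
+# the weights, that scaling must be undone in every output path.
+def _same_mean_sketch(normalized: pd.DataFrame) -> pd.DataFrame:
+    means = normalized.mean()  # assumes nonzero column means
+    return normalized * (means.mean() / means)  # columns now share one mean
+
+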
+class TestWeightsWithSameMean:
+ """Weights combined with same_mean=True should not produce out-of-range values."""
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_same_mean_output_in_range(self, weights):
+ agg = _make_agg(weights, same_mean=True)
+ tp = agg.typical_periods
+
+ for col in RAW.columns:
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+
+ def test_same_mean_uniform_weights_equal_no_weights(self):
+ agg_none = _make_agg(None, same_mean=True)
+ agg_uniform = _make_agg(dict.fromkeys(RAW.columns, 3), same_mean=True)
+ pd.testing.assert_frame_equal(
+ agg_none.typical_periods,
+ agg_uniform.typical_periods,
+ atol=1e-6,
+ )
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_same_mean_preserves_column_means(self, weights):
+ agg = _make_agg(weights, same_mean=True, rescale_cluster_periods=True)
+ pred = agg.predict_original_data()
+
+ for col in RAW.columns:
+ orig_mean = RAW[col].mean()
+ pred_mean = pred[col].mean()
+ if orig_mean == 0:
+ continue
+ rel_error = abs(pred_mean - orig_mean) / abs(orig_mean)
+ assert rel_error < 0.02, (
+ f"{col}: sameMean + weight mean error {rel_error:.4f} > 2%"
+ )
+
+
+# ---------------------------------------------------------------------------
+# 6. Weight interaction with extreme periods
+# ---------------------------------------------------------------------------
+
+
+class TestWeightsWithExtremePeriods:
+ """Weights should not distort extreme period selection or values."""
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ None,
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ ],
+ )
+ def test_extreme_periods_in_range(self, weights):
+ agg = _make_agg(
+ weights,
+ add_peak_max=["GHI"],
+ add_peak_min=["T"],
+ )
+ tp = agg.typical_periods
+
+ for col in RAW.columns:
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+
+
+# ---------------------------------------------------------------------------
+# 7. Accuracy indicators should be unaffected by weight scale
+# ---------------------------------------------------------------------------
+
+
+class TestAccuracyIndicatorsConsistency:
+ """Accuracy indicators should compare unweighted data —
+ they should not be inflated or deflated by weight magnitude."""
+
+ def test_accuracy_values_reasonable(self):
+ """RMSE and MAE should be between 0 and 1 (on normalized data)."""
+ agg = _make_agg({"GHI": 10, "T": 1, "Wind": 1, "Load": 1})
+ acc = agg.accuracy_indicators()
+
+ for col in RAW.columns:
+ rmse = acc.loc[col, "RMSE"]
+ mae = acc.loc[col, "MAE"]
+ assert 0 <= rmse <= 2, f"{col} RMSE={rmse} out of reasonable range"
+ assert 0 <= mae <= 2, f"{col} MAE={mae} out of reasonable range"
+
+ def test_weight_scaling_does_not_inflate_metrics(self):
+ """Doubling one weight should not double that column's RMSE."""
+ agg1 = _make_agg({"GHI": 1, "T": 1, "Wind": 1, "Load": 1})
+ agg2 = _make_agg({"GHI": 2, "T": 1, "Wind": 1, "Load": 1})
+
+ rmse1 = agg1.accuracy_indicators().loc["GHI", "RMSE"]
+ rmse2 = agg2.accuracy_indicators().loc["GHI", "RMSE"]
+
+ if rmse1 > 0:
+ ratio = rmse2 / rmse1
+ assert ratio < 1.8, (
+ f"GHI RMSE ratio {ratio:.2f} suggests weight leaked into metric"
+ )
+
+
+# ---------------------------------------------------------------------------
+# 8. Partial weight dict (not all columns specified)
+# ---------------------------------------------------------------------------
+
+
+class TestPartialWeightDict:
+ """When weight_dict only specifies some columns, others should be unaffected."""
+
+ def test_partial_weights_runs(self):
+ """Should not crash when only some columns are weighted."""
+ agg = _make_agg({"GHI": 5})
+ tp = agg.typical_periods
+ assert tp.shape[1] == len(RAW.columns)
+
+ def test_partial_weights_output_in_range(self):
+ agg = _make_agg({"GHI": 5})
+ tp = agg.typical_periods
+ for col in RAW.columns:
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+
+ @pytest.mark.xfail(
+ reason="accuracyIndicators KeyError with partial weightDict - pre-existing bug"
+ )
+ def test_partial_weights_accuracy(self):
+ """Accuracy indicators should work with partial weight dicts."""
+ agg = _make_agg({"GHI": 5})
+ acc = agg.accuracy_indicators()
+ assert set(acc.index) == set(RAW.columns)
+
+
+# ---------------------------------------------------------------------------
+# 9. Edge case: very large / very small weights
+# ---------------------------------------------------------------------------
+
+
+class TestExtremeWeights:
+ """Extreme weight values should not break the pipeline."""
+
+ def test_very_large_weight(self):
+ agg = _make_agg({"GHI": 1000, "T": 1, "Wind": 1, "Load": 1})
+ tp = agg.typical_periods
+ assert not tp.isnull().any().any(), "NaN in output with large weight"
+ assert tp["GHI"].max() <= RAW["GHI"].max() + 1e-6
+
+ def test_very_small_weight(self):
+ """Very small weights should be clamped to MIN_WEIGHT, not zero."""
+ agg = _make_agg({"GHI": 1e-10, "T": 1, "Wind": 1, "Load": 1})
+ tp = agg.typical_periods
+ assert not tp.isnull().any().any(), "NaN in output with tiny weight"
+
+ def test_zero_weight_clamped(self):
+ """Zero weight should be clamped, not cause division by zero."""
+ agg = _make_agg({"GHI": 0, "T": 1, "Wind": 1, "Load": 1})
+ tp = agg.typical_periods
+ assert not tp.isnull().any().any(), "NaN in output with zero weight"
+
+
+# ---------------------------------------------------------------------------
+# 10. Weight x rescale interaction: scale_ub correctness
+# ---------------------------------------------------------------------------
+
+
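+# Sketch of the rescale step these tests probe (an illustration, not tsam's
+# actual code; scale_ub names the clip bound described below): each column is
+# scaled toward its original mean, with the factor clipped so rescaling
+# cannot push values arbitrarily far past the data range.
+def _rescale_sketch(col: pd.Series, target_mean: float, scale_ub: float) -> pd.Series:
+    factor = min(target_mean / col.mean(), scale_ub)  # clip at the upper bound
+    return col * factor
+
+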
+class TestRescaleScaleUb:
+ """The rescaling clip bound (scale_ub) should not cause weighted columns
+ to be clipped differently in a way that breaks reconstruction."""
+
+ def test_rescale_with_high_weight_preserves_mean(self):
+ """A column with very high weight should still have its mean preserved."""
+ agg = _make_agg(
+ {"GHI": 50, "T": 1, "Wind": 1, "Load": 1},
+ rescale_cluster_periods=True,
+ )
+ pred = agg.predict_original_data()
+
+ orig_mean = RAW["GHI"].mean()
+ pred_mean = pred["GHI"].mean()
+ if orig_mean > 0:
+ rel_error = abs(pred_mean - orig_mean) / orig_mean
+ assert rel_error < 0.02, (
+ f"GHI mean rel error {rel_error:.4f} with high weight — "
+ f"scale_ub may be distorting rescaling"
+ )
+
+ def test_rescale_without_weights_preserves_mean(self):
+ """Baseline: rescaling without weights should preserve means well."""
+ agg = _make_agg(None, rescale_cluster_periods=True)
+ pred = agg.predict_original_data()
+
+ for col in RAW.columns:
+ orig_mean = RAW[col].mean()
+ pred_mean = pred[col].mean()
+ if orig_mean == 0:
+ continue
+ rel_error = abs(pred_mean - orig_mean) / abs(orig_mean)
+ assert rel_error < 0.02, (
+ f"{col}: baseline rescale rel error {rel_error:.4f}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# 11. Weight x k-means (different clustering method)
+# ---------------------------------------------------------------------------
+
+
+class TestWeightsWithKMeans:
+ """Weight handling should be consistent across clustering methods."""
+
+    def test_kmeans_uniform_weights_stay_in_range(self):
+        """k-means is non-deterministic and uniform scaling changes the
+        centroid init, so we only check that the output stays in the data
+        range, not that it is bit-identical to the unweighted run."""
+ agg_uniform = _make_agg(dict.fromkeys(RAW.columns, 3), cluster_method="k_means")
+ tp = agg_uniform.typical_periods
+ for col in RAW.columns:
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+
+ def test_kmeans_output_in_range(self):
+ agg = _make_agg(
+ {"GHI": 5, "T": 1, "Wind": 1, "Load": 1},
+ cluster_method="k_means",
+ )
+ tp = agg.typical_periods
+ for col in RAW.columns:
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+
+
+# ---------------------------------------------------------------------------
+# 12. Weight x segmentation (the core bug: weights leaked into reconstruction)
+# ---------------------------------------------------------------------------
+
+
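+# The pre-fix failure mode, sketched as commented pseudocode (hypothetical
+# names, not the actual segmentation path): segment means are computed in
+# weighted space, and the old reconstruction skipped dividing the weight back
+# out, so a column with weight 100 came back roughly 100x too large.
+#
+#     segment_means = aggregate_segments(weighted_periods)  # still * w
+#     buggy = denormalize(segment_means)                    # leaks w
+#     fixed = denormalize(segment_means / w)                # weight removed
+
+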
+class TestWeightsWithSegmentation:
+ """Weights must not leak into segmented reconstruction.
+ Before the fix, predict_original_data with segmentation returned values
+ scaled by the weight (e.g. GHI=100 → output 100x too large)."""
+
+ def test_segmentation_uniform_weights_equal_no_weights(self):
+ """Uniform weights + segmentation must match no weights."""
+ agg_none = _make_agg(None, segmentation=True, no_segments=4)
+ agg_uniform = _make_agg(
+ dict.fromkeys(RAW.columns, 100), segmentation=True, no_segments=4
+ )
+ pd.testing.assert_frame_equal(
+ agg_none.predict_original_data(),
+ agg_uniform.predict_original_data(),
+ atol=1e-6,
+ )
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ {"GHI": 100, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_segmentation_output_in_range(self, weights):
+ agg = _make_agg(weights, segmentation=True, no_segments=4)
+ pred = agg.predict_original_data()
+ for col in RAW.columns:
+ assert pred[col].min() >= RAW[col].min() - 1e-6, (
+ f"{col}: pred min {pred[col].min()} < data min {RAW[col].min()}"
+ )
+ assert pred[col].max() <= RAW[col].max() + 1e-6, (
+ f"{col}: pred max {pred[col].max()} > data max {RAW[col].max()}"
+ )
+
+ @pytest.mark.parametrize(
+ "weights",
+ [
+ {"GHI": 100, "T": 1, "Wind": 1, "Load": 1},
+ {"GHI": 0.1, "T": 10, "Wind": 0.5, "Load": 3},
+ ],
+ )
+ def test_segmentation_preserves_column_means(self, weights):
+ """Reconstructed means should be close to original, not scaled by weight."""
+ agg = _make_agg(
+ weights, segmentation=True, no_segments=4, rescale_cluster_periods=True
+ )
+ pred = agg.predict_original_data()
+ for col in RAW.columns:
+ orig_mean = RAW[col].mean()
+ pred_mean = pred[col].mean()
+ if orig_mean == 0:
+ continue
+ rel_error = abs(pred_mean - orig_mean) / abs(orig_mean)
+ assert rel_error < 0.05, (
+ f"{col}: segmentation mean error {rel_error:.4f} > 5% "
+ f"(orig={orig_mean:.4f}, pred={pred_mean:.4f})"
+ )
+
+ def test_segmentation_samemean_weights(self):
+ """same_mean + segmentation + weights must not produce scaled output."""
+ agg = _make_agg(
+ {"GHI": 100, "T": 1, "Wind": 1, "Load": 1},
+ segmentation=True,
+ no_segments=4,
+ same_mean=True,
+ )
+ pred = agg.predict_original_data()
+ for col in RAW.columns:
+ assert pred[col].min() >= RAW[col].min() - 1e-6
+ assert pred[col].max() <= RAW[col].max() + 1e-6
+
+ def test_segmentation_typical_periods_in_range(self):
+ """typical_periods with segmentation + weights should be in range."""
+ agg = _make_agg(
+ {"GHI": 100, "T": 1, "Wind": 1, "Load": 1},
+ segmentation=True,
+ no_segments=4,
+ )
+ tp = agg.typical_periods
+ for col in RAW.columns:
+ assert tp[col].min() >= RAW[col].min() - 1e-6
+ assert tp[col].max() <= RAW[col].max() + 1e-6
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
diff --git a/test/test_weightingFactors.py b/test/test_weightingFactors.py
index 49bbd2e6..140eec98 100644
--- a/test/test_weightingFactors.py
+++ b/test/test_weightingFactors.py
@@ -9,9 +9,9 @@
def test_weightingFactors():
- hoursPerPeriod = 24
+ hours_per_period = 24
- noTypicalPeriods = 8
+ no_typical_periods = 8
weightDict1 = {"GHI": 1, "T": 1, "Wind": 1, "Load": 1}
@@ -23,43 +23,43 @@ def test_weightingFactors():
aggregation1 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- weightDict=weightDict1,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ weight_dict=weightDict1,
)
aggregation2 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- weightDict=weightDict2,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ weight_dict=weightDict2,
)
aggregation3 = tsam.TimeSeriesAggregation(
raw,
- noTypicalPeriods=noTypicalPeriods,
- hoursPerPeriod=hoursPerPeriod,
- clusterMethod="hierarchical",
- weightDict=weightDict3,
+ no_typical_periods=no_typical_periods,
+ hours_per_period=hours_per_period,
+ cluster_method="hierarchical",
+ weight_dict=weightDict3,
)
# make sure that the accuracy indicators stay the same when the different attributes are equally overweighted
np.testing.assert_almost_equal(
- aggregation1.accuracyIndicators().values,
- aggregation2.accuracyIndicators().values,
+ aggregation1.accuracy_indicators().values,
+ aggregation2.accuracy_indicators().values,
decimal=6,
)
# make sure that the RMSE of GHI is less while the other RMSEs are bigger, when GHI is overweighted
np.testing.assert_array_less(
- aggregation3.accuracyIndicators().loc["GHI", "RMSE"],
- aggregation1.accuracyIndicators().loc["GHI", "RMSE"],
+ aggregation3.accuracy_indicators().loc["GHI", "RMSE"],
+ aggregation1.accuracy_indicators().loc["GHI", "RMSE"],
)
np.testing.assert_array_less(
- aggregation1.accuracyIndicators().loc[["Load", "T", "Wind"], "RMSE"],
- aggregation3.accuracyIndicators().loc[["Load", "T", "Wind"], "RMSE"],
+ aggregation1.accuracy_indicators().loc[["Load", "T", "Wind"], "RMSE"],
+ aggregation3.accuracy_indicators().loc[["Load", "T", "Wind"], "RMSE"],
)