diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8868abb..d19ebcc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.7] steps: - uses: actions/checkout@v2 diff --git a/src/timeseers/constant.py b/src/timeseers/constant.py index ab080b5..86e612d 100644 --- a/src/timeseers/constant.py +++ b/src/timeseers/constant.py @@ -1,6 +1,6 @@ import numpy as np from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict import pymc3 as pm @@ -28,7 +28,17 @@ def definition(self, model, X, scale_factor): return c[group] - def _predict(self, trace, t, pool_group=0): + def _predict(self, trace, X): + t = X['t'] + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + ind = trace[self._param_name("c")][np.arange(len(X)), pool_group] + + return np.ones_like(t)[:, None] * ind.reshape(1, -1) + + def _plot_predict(self, trace, t, pool_group=0): ind = trace[self._param_name("c")][:, pool_group] return np.ones_like(t)[:, None] * ind.reshape(1, -1) @@ -39,7 +49,7 @@ def plot(self, trace, scaled_t, y_scaler): trend_return = np.empty((len(scaled_t), len(self.groups_))) plot_data = [] for group_code, group_name in self.groups_.items(): - y_hat = np.mean(self._predict(trace, scaled_t, group_code), axis=1) + y_hat = np.mean(self._plot_predict(trace, scaled_t, group_code), axis=1) trend_return[:, group_code] = y_hat plot_data.append((group_name, y_hat[0])) ax.bar(*zip(*plot_data)) diff --git a/src/timeseers/fourier_seasonality.py b/src/timeseers/fourier_seasonality.py index ace83de..279bd6c 100644 --- a/src/timeseers/fourier_seasonality.py +++ b/src/timeseers/fourier_seasonality.py @@ -2,7 +2,7 @@ import pandas as pd import pymc3 as pm from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict class FourierSeasonality(TimeSeriesModel): @@ -54,22 +54,32 @@ def definition(self, model, X, scale_factor): return seasonality - def _predict(self, trace, t, pool_group=0): - return self._X_t(t, self.p_, self.n) @ trace[self._param_name("beta")][:, pool_group].T + def _predict(self, trace, X): + t = X['t'] + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + l = self._X_t(t, self.p_, self.n)[..., None] + r = trace[self._param_name("beta")][:, pool_group].transpose(1, 2, 0) - def plot(self, trace, scaled_t, y_scaler): + return (l * r).sum(axis=1) + + def plot(self, trace, X, y_scaler): ax = add_subplot() ax.set_title(str(self)) - seasonality_return = np.empty((len(scaled_t), len(self.groups_))) + scaled_s = self._predict(trace, X) + s = y_scaler.inv_transform(scaled_s) for group_code, group_name in self.groups_.items(): - scaled_s = self._predict(trace, scaled_t, group_code) - s = y_scaler.inv_transform(scaled_s) - ax.plot(list(range(self.period.days)), s.mean(axis=1)[:self.period.days], label=group_name) - - seasonality_return[:, group_code] = scaled_s.mean(axis=1) + mask = X[self.pool_cols] == group_name if self.pool_cols is not None else np.full(len(s), True) + ax.plot( + list(range(self.period.days)), + s[mask].mean(axis=1)[:self.period.days], + label=group_name + ) - return seasonality_return + return scaled_s def __repr__(self): return f"FourierSeasonality(n={self.n}, " \ diff --git a/src/timeseers/indicator.py b/src/timeseers/indicator.py index d0f0f87..1a83a40 100644 --- a/src/timeseers/indicator.py +++ b/src/timeseers/indicator.py @@ -1,6 +1,6 @@ import numpy as np from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict import pymc3 as pm from scipy.stats import mode @@ -24,7 +24,17 @@ def definition(self, model, X, scale_factor): return ind[group] - def _predict(self, trace, t, pool_group=0): + def _predict(self, trace, X): + t = X['t'] + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + ind = trace[self._param_name("ind")][np.arange(len(t)), pool_group] + + return np.ones_like(t)[:, None] * ind.reshape(1, -1) + + def _plot_predict(self, trace, t, pool_group=0): ind = trace[self._param_name("ind")][:, pool_group] return np.ones_like(t)[:, None] * ind.reshape(1, -1) @@ -35,7 +45,7 @@ def plot(self, trace, scaled_t, y_scaler): ax.set_xticks([]) trend_return = np.empty((len(scaled_t), len(self.groups_))) for group_code, group_name in self.groups_.items(): - y_hat = mode(self._predict(trace, scaled_t, group_code), axis=1)[0][:, 0] + y_hat = mode(self._plot_predict(trace, scaled_t, group_code), axis=1)[0][:, 0] ax.plot(scaled_t, y_hat, label=group_name) trend_return[:, group_code] = y_hat ax.set_ylim([-1.05, 1.05]) diff --git a/src/timeseers/linear_trend.py b/src/timeseers/linear_trend.py index 5d19960..640ff76 100644 --- a/src/timeseers/linear_trend.py +++ b/src/timeseers/linear_trend.py @@ -1,6 +1,6 @@ import numpy as np from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict import pymc3 as pm @@ -50,7 +50,22 @@ def definition(self, model, X, scale_factor): ) return g - def _predict(self, trace, t, pool_group=0): + def _predict(self, trace, X): + t = X['t'] + A = (t[:, None] > self.s) * 1 + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + + idx = np.arange(len(X)) + k, m = trace[self._param_name("k")][idx, pool_group], trace[self._param_name("m")][idx, pool_group] + growth = k + A @ trace[self._param_name("delta")][idx, pool_group].T + gamma = -self.s[:, None] * trace[self._param_name("delta")][idx, pool_group].T + offset = m + A @ gamma + return growth * t[:, None] + offset + + def _plot_predict(self, trace, t, pool_group=0): A = (t[:, None] > self.s) * 1 k, m = trace[self._param_name("k")][:, pool_group], trace[self._param_name("m")][:, pool_group] @@ -65,7 +80,7 @@ def plot(self, trace, scaled_t, y_scaler): ax.set_xticks([]) trend_return = np.empty((len(scaled_t), len(self.groups_))) for group_code, group_name in self.groups_.items(): - scaled_trend = self._predict(trace, scaled_t, group_code) + scaled_trend = self._plot_predict(trace, scaled_t, group_code) trend = y_scaler.inv_transform(scaled_trend) ax.plot(scaled_t, trend.mean(axis=1), label=group_name) trend_return[:, group_code] = scaled_trend.mean(axis=1) diff --git a/src/timeseers/logistic_growth.py b/src/timeseers/logistic_growth.py index 23ecd3d..cfac3d4 100644 --- a/src/timeseers/logistic_growth.py +++ b/src/timeseers/logistic_growth.py @@ -2,7 +2,7 @@ import theano.tensor as T import theano from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict import pymc3 as pm @@ -85,8 +85,34 @@ def get_gamma(i, gamma_init, delta, m, k, s): growth = self.cap_scaled / (1 + pm.math.exp(-growth)) return growth - def _predict(self, trace, t, pool_group=0): + def _predict(self, trace, X): + t = X['t'] + A = (t[:, None] > self.s) * 1 + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + + idx = np.arange(len(t)) + delta = trace[self._param_name("delta")][idx, pool_group] + k = trace[self._param_name("k")][idx, pool_group] + m = trace[self._param_name("m")][idx, pool_group] + A = (t[:, None] > self.s) * 1 + + gamma = np.zeros(delta.T.shape) + for i in range(gamma.shape[0]): + gamma[i] = ( + (self.s[i] - m - gamma[:i].sum(axis=0)) * + (1 - ((k + delta[:, :i].sum(axis=1)) / (k + delta[:, :i + 1].sum(axis=1)))).T + ) + + g = ( + (k + A @ delta.T) * + (t[:, None] - (m + A @ gamma)) + ) + return self.cap_scaled / (1 + np.exp(-g)) + def _plot_predict(self, trace, t, pool_group=0): delta = trace[self._param_name("delta")][:, pool_group] k = trace[self._param_name("k")][:, pool_group] m = trace[self._param_name("m")][:, pool_group] @@ -98,6 +124,7 @@ def _predict(self, trace, t, pool_group=0): (self.s[i] - m - gamma[:i].sum(axis=0)) * (1 - ((k + delta[:, :i].sum(axis=1)) / (k + delta[:, :i+1].sum(axis=1)))).T ) + g = ( (k + A @ delta.T) * (t[:, None] - (m + A @ gamma)) @@ -110,7 +137,7 @@ def plot(self, trace, scaled_t, y_scaler): ax.set_xticks([]) growth_return = np.empty((len(scaled_t), len(self.groups_))) for group_code, group_name in self.groups_.items(): - scaled_growth = self._predict(trace, scaled_t, group_code) + scaled_growth = self._plot_predict(trace, scaled_t, group_code) growth = y_scaler.inv_transform(scaled_growth) ax.plot(scaled_t, growth.mean(axis=1), label=group_name) growth_return[:, group_code] = scaled_growth.mean(axis=1) diff --git a/src/timeseers/regressor.py b/src/timeseers/regressor.py index e177336..22c3e0e 100644 --- a/src/timeseers/regressor.py +++ b/src/timeseers/regressor.py @@ -1,6 +1,6 @@ import numpy as np from timeseers.timeseries_model import TimeSeriesModel -from timeseers.utils import add_subplot, get_group_definition +from timeseers.utils import add_subplot, get_group_definition, invert_dict import pymc3 as pm @@ -31,7 +31,17 @@ def definition(self, model, X, scale_factor): return k[group, X[self.on].cat.codes] - def _predict(self, trace, t, pool_group=0): + def _predict(self, trace, X): + t = X['t'] + if self.pool_type == 'complete': + pool_group = np.zeros(len(X), dtype=np.int) + else: + pool_group = X[self.pool_cols].map(invert_dict(self.groups_)) + + ind = trace[self._param_name("k")][np.arange(len(t)), pool_group] + return np.ones_like(t)[:, None] * ind.reshape(1, -1) * X[self.on] + + def _plot_predict(self, trace, t, pool_group=0): ind = trace[self._param_name("k")][:, pool_group] return np.ones_like(t)[:, None] * ind.reshape(1, -1) @@ -43,7 +53,7 @@ def plot(self, trace, scaled_t, y_scaler): trend_return = np.empty((len(scaled_t), len(self.groups_))) plot_data = [] for group_code, group_name in self.groups_.items(): - y_hat = np.mean(self._predict(trace, scaled_t, group_code), axis=1) + y_hat = np.mean(self._plot_predict(trace, scaled_t, group_code), axis=1) trend_return[:, group_code] = y_hat plot_data.append((group_name, y_hat[0])) ax.bar(*zip(*plot_data)) diff --git a/src/timeseers/timeseries_model.py b/src/timeseers/timeseries_model.py index 310cd34..864d690 100644 --- a/src/timeseers/timeseries_model.py +++ b/src/timeseers/timeseries_model.py @@ -1,6 +1,6 @@ import pandas as pd import pymc3 as pm -from timeseers.utils import MinMaxScaler, StdScaler, add_subplot +from timeseers.utils import MinMaxScaler, StdScaler, add_subplot, cartesian_product from timeseers.likelihood import Gaussian import numpy as np from abc import ABC, abstractmethod @@ -29,6 +29,25 @@ def fit(self, X, y, X_scaler=MinMaxScaler, y_scaler=StdScaler, likelihood=None, likelihood.observed(mu, y_scaled) self.trace_ = pm.sample(**sample_kwargs) + def predict(self, X, ci_percentiles=(0.08, 0.5, 0.92)): + X_to_scale = X.select_dtypes(exclude='category') + X_scaled = self._X_scaler_.transform(X_to_scale) + X_scaled = X_scaled.join(X.select_dtypes('category')) + y_hat_scaled = self._predict(self.trace_, X_scaled) + + # TODO: We only take the uncertainty of the parameters here, still need to add the uncertainty + # from the likelihood as well + + y_hat = self._y_scaler_.inv_transform(y_hat_scaled) + return pd.DataFrame( + np.percentile( + y_hat, + ci_percentiles, + axis=1 + ).T, + columns=[f"perc_{p}" for p in ci_percentiles] + ) + def plot_components(self, X_true=None, y_true=None, groups=None, fig=None): import matplotlib.pyplot as plt @@ -41,7 +60,12 @@ def plot_components(self, X_true=None, y_true=None, groups=None, fig=None): t = pd.date_range(t_min, t_max, freq='D') scaled_t = np.linspace(0, 1 + lookahead_scale, len(t)) - total = self.plot(self.trace_, scaled_t, self._y_scaler_) + X = pd.DataFrame({'t': scaled_t}) + + for col, val in self._groups().items(): + X = cartesian_product(X, pd.DataFrame({col: list(val.values())})) + + total = self.plot(self.trace_, X, self._y_scaler_) ax = add_subplot() ax.set_title("overall") @@ -61,6 +85,10 @@ def plot_components(self, X_true=None, y_true=None, groups=None, fig=None): def plot(self, trace, t, y_scaler): pass + @abstractmethod + def _predict(self, trace, X): + pass + @abstractmethod def definition(self, model, X_scaled, scale_factor): pass @@ -68,6 +96,11 @@ def definition(self, model, X_scaled, scale_factor): def _param_name(self, param): return f"{self.name}-{param}" + def _groups(self): + if self.pool_type == "complete": + return {} + return {self.pool_cols: self.groups_} + def __add__(self, other): return AdditiveTimeSeries(self, other) @@ -82,17 +115,29 @@ class AdditiveTimeSeries(TimeSeriesModel): def __init__(self, left, right): self.left = left self.right = right + self.name = "AdditiveTimeSeries" super().__init__() def definition(self, *args, **kwargs): - return self.left.definition(*args, **kwargs) + self.right.definition( - *args, **kwargs + return ( + self.left.definition(*args, **kwargs) + + self.right.definition(*args, **kwargs) + ) + + def _predict(self, trace, x_scaled): + return ( + self.left._predict(trace, x_scaled) + + self.right._predict(trace, x_scaled) + ) + + def plot(self, trace, X, y_scaler): + return ( + self.left.plot(trace, X, y_scaler) + + self.right.plot(trace, X, y_scaler) ) - def plot(self, *args, **kwargs): - left = self.left.plot(*args, **kwargs) - right = self.right.plot(*args, **kwargs) - return left + right + def _groups(self): + return {**self.left._groups(), ** self.right._groups()} def __repr__(self): return ( @@ -107,17 +152,29 @@ class MultiplicativeTimeSeries(TimeSeriesModel): def __init__(self, left, right): self.left = left self.right = right + self.name = "MultiplicativeTimeSeries" super().__init__() def definition(self, *args, **kwargs): - return self.left.definition(*args, **kwargs) * ( - 1 + self.right.definition(*args, **kwargs) + return ( + self.left.definition(*args, **kwargs) * + (1 + self.right.definition(*args, **kwargs)) + ) + + def _predict(self, trace, x_scaled): + return ( + self.left._predict(trace, x_scaled) * + (1 + self.right._predict(trace, x_scaled)) + ) + + def plot(self, trace, X, y_scaler): + return ( + self.left.plot(trace, X, y_scaler) * + (1 + self.right.plot(trace, X, y_scaler)) ) - def plot(self, trace, scaled_t, y_scaler): - left = self.left.plot(trace, scaled_t, y_scaler) - right = self.right.plot(trace, scaled_t, y_scaler) - return left + (left * right) + def _groups(self): + return {**self.left._groups(), ** self.right._groups()} def __repr__(self): return ( diff --git a/src/timeseers/utils.py b/src/timeseers/utils.py index a6d9eae..f936f39 100644 --- a/src/timeseers/utils.py +++ b/src/timeseers/utils.py @@ -170,6 +170,10 @@ def X(t, p=365.25, n=10): ) +def invert_dict(dct): + return {v: k for k, v in dct.items()} + + def get_group_definition(X, pool_cols, pool_type): if pool_type == 'complete': group = np.zeros(len(X), dtype='int') @@ -180,3 +184,7 @@ def get_group_definition(X, pool_cols, pool_type): group_mapping = dict(enumerate(X[pool_cols].cat.categories)) n_groups = X[pool_cols].nunique() return group, n_groups, group_mapping + + +def cartesian_product(df1, df2): + return df1.assign(merge_key=1).merge(df2.assign(merge_key=1), on="merge_key").drop(columns="merge_key")