diff --git a/docs/source/biblio.bib b/docs/source/biblio.bib index 4a15fd48..9b1438d6 100644 --- a/docs/source/biblio.bib +++ b/docs/source/biblio.bib @@ -2198,3 +2198,12 @@ @article{TRUONG2020107299 volume = {167}, year = {2020} } +@article{Banko2011, +author = {Banko, Z. and Dobos, L. and Abonyi, J.}, +journal = {Conservation, Information, Evolution - towards a sustainable engineering and economy}, +number = {1}, +pages = {11--24}, +title = {{Dynamic principal component analysis in multivariate time-series segmentation}}, +volume = {1}, +year = {2011} +} diff --git a/docs/source/costs/costqresiduals.rst b/docs/source/costs/costqresiduals.rst new file mode 100644 index 00000000..d913c8bb --- /dev/null +++ b/docs/source/costs/costqresiduals.rst @@ -0,0 +1 @@ +.. automodule:: ruptures.costs.costqresiduals diff --git a/docs/source/costs/costtsquared.rst b/docs/source/costs/costtsquared.rst new file mode 100644 index 00000000..4fa91929 --- /dev/null +++ b/docs/source/costs/costtsquared.rst @@ -0,0 +1 @@ +.. automodule:: ruptures.costs.costtsquared diff --git a/requirements.txt b/requirements.txt index 6c894dfd..eb76affd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy pre-commit -scipy +scikit-learn +scipy \ No newline at end of file diff --git a/ruptures/costs/__init__.py b/ruptures/costs/__init__.py index fa205684..8570da12 100644 --- a/ruptures/costs/__init__.py +++ b/ruptures/costs/__init__.py @@ -17,6 +17,8 @@ costautoregressive costml costrank + costqresiduals + costtsquared costcustom """ @@ -31,3 +33,5 @@ from .costautoregressive import CostAR from .costml import CostMl from .costrank import CostRank +from .costqresiduals import CostQResiduals +from .costtsquared import CostTSquared \ No newline at end of file diff --git a/ruptures/costs/costqresiduals.py b/ruptures/costs/costqresiduals.py new file mode 100644 index 00000000..f4a79d9c --- /dev/null +++ b/ruptures/costs/costqresiduals.py @@ -0,0 +1,141 @@ +r""" + +.. _sec-costqresiduals: + +Q reconstruction error based cost function +==================================================================================================== + +Description +---------------------------------------------------------------------------------------------------- + +This cost function detects change of the correlation between the variables with ignoring noisy demension :cite:`qresiduals-Banko2011`. +Formally, for a signal :math:`\{y_t\}_t` on an interval :math:`I`, the cost function is equal to + + .. math:: c(y_{I}) = \sum_{t\in I} \|y_t - \bar{y_t}\|_2 + +where :math:`\bar{y_t}` is the PCA based reconstructed value of :math:`\{y_t\}_{t\in I}` computed by + + .. math:: \bar{y_t} = {U_p}^T U_p y_t + +where :math:`U_p` is singular vectors belong to the most important :math:`p` singular values of :math:`y_t`. + +Usage +---------------------------------------------------------------------------------------------------- + +Start with the usual imports and create a signal. + +.. code-block:: python + + import numpy as np + import matplotlib.pylab as plt + import ruptures as rpt + # creation of data + n = 200 # number of samples + n_bkps = 3 # number of change points + n_noisy_features = 3 # number of noisy features + signal, bkps = rpt.pw_normal(n, n_bkps, n_noisy_features) + +Then create a :class:`CostQResiduals` instance and print the cost of the sub-signal :code:`signal[50:150]`. + +.. code-block:: python + + c = rpt.costs.CostQResiduals(n_components=1).fit(signal) + print(c.error(50, 150)) + +You can also compute the sum of costs for a given list of change points. + +.. code-block:: python + + print(c.sum_of_costs(bkps)) + print(c.sum_of_costs([10, 100, 200, 250, n])) + + +In order to use this cost class in a change point detection algorithm (inheriting from :class:`BaseEstimator`), either pass a :class:`CostNormal` instance (through the argument ``'custom_cost'``) or set :code:`model="qresid"`. + +.. code-block:: python + + c = rpt.costs.CostQResiduals(); algo = rpt.Dynp(custom_cost=c) + # is equivalent to + algo = rpt.Dynp(model="qresid") + + +Code explanation +---------------------------------------------------------------------------------------------------- + +.. autoclass:: ruptures.costs.CostQResiduals + :members: + :special-members: __init__ + +.. rubric:: References + +.. bibliography:: ../biblio.bib + :style: alpha + :cited: + :labelprefix: Q + :keyprefix: qresiduals- +""" + + +import numpy as np +from sklearn.decomposition import PCA + +from ruptures.base import BaseCost +from ruptures.costs import NotEnoughPoints + + +class CostQResiduals(BaseCost): + + """Q residuals.""" + + model = "qresid" + + def __init__(self, n_components=2): + """Create a new instance. + + Args: + n_components (int, optional): number of components to keep in signal in each segment. + + Returns: + self + """ + self.min_size = 2 + self.signal = None + self.n_components = n_components + + def fit(self, signal): + """Set parameters of the instance. + + Args: + signal (array): signal. Shape (n_samples,) or (n_samples, n_features) + + Returns: + self + """ + if signal.ndim == 1 or (signal.ndim == 2 and signal.shape[1] == 1): + raise ValueError("The signal must be multivariate.") + else: + self.signal = signal + + return self + + def error(self, start, end): + """Return the approximation cost on the segment [start:end]. + + Args: + start (int): start of the segment + end (int): end of the segment + + Returns: + float: segment cost + + Raises: + NotEnoughPoints: when the segment is too short (less than ``'min_size'`` samples). + """ + if end - start < self.min_size: + raise NotEnoughPoints + sub = self.signal[start:end] + + pca = PCA(self.n_components).fit(sub) + sub_reconstruct = pca.inverse_transform(pca.transform(sub)) + + return np.sum((sub - sub_reconstruct)**2) diff --git a/ruptures/costs/costtsquared.py b/ruptures/costs/costtsquared.py new file mode 100644 index 00000000..dea24f53 --- /dev/null +++ b/ruptures/costs/costtsquared.py @@ -0,0 +1,140 @@ +r""" +.. _sec-costtsquared: + +Hotelling's T2 statistic based cost function +==================================================================================================== + +Description +---------------------------------------------------------------------------------------------------- + +This cost function detects drift of the center of the signal :cite:`tsquared-Banko2011`. +Formally, for a signal :math:`\{y_t\}_t` on an interval :math:`I`, the cost function is equal to + + .. math:: c(y_{I}) = \sum_{t\in I} \|y_t - \bar{y_t}\|_2 + +where :math:`\bar{y_t}` is the deviation from the center of the data on lower dimensional representation of :math:`\{y_t\}_{t\in I}`, + + .. math:: \bar{y_t} = {U_p}^T y_t {y_t}^T {U_p} + +where :math:`U_p` is singular vectors belong to the most important :math:`p` singular values of :math:`\{y_t\}_{t\in I}`. + +Usage +---------------------------------------------------------------------------------------------------- + +Start with the usual imports and create a signal. + +.. code-block:: python + + import numpy as np + import matplotlib.pylab as plt + import ruptures as rpt + # creation of data + n, dim = 500, 3 # number of samples, dimension + n_bkps, sigma = 3, 5 # number of change points, noise standard deviation + signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma) + +Then create a :class:`CostTSquared` instance and print the cost of the sub-signal :code:`signal[50:150]`. + +.. code-block:: python + + c = rpt.costs.CostTSquared(n_components=2).fit(signal) + print(c.error(50, 150)) + + +You can also compute the sum of costs for a given list of change points. + +.. code-block:: python + + print(c.sum_of_costs(bkps)) + print(c.sum_of_costs([10, 100, 200, 250, n])) + + +In order to use this cost class in a change point detection algorithm (inheriting from :class:`BaseEstimator`), either pass a :class:`CostTSquared` instance (through the argument ``'custom_cost'``) or set :code:`model="rank"`. + +.. code-block:: python + + c = rpt.costs.CostTSquared(); algo = rpt.Dynp(custom_cost=c) + # is equivalent to + algo = rpt.Dynp(model="t2") + + +Code explanation +---------------------------------------------------------------------------------------------------- + +.. autoclass:: ruptures.costs.CostTSquared + :members: + :special-members: __init__ + +.. rubric:: References + +.. bibliography:: ../biblio.bib + :style: alpha + :cited: + :labelprefix: T + :keyprefix: tsquared- +""" + + +import numpy as np +from sklearn.decomposition import PCA + +from ruptures.base import BaseCost +from ruptures.costs import NotEnoughPoints + + +class CostTSquared(BaseCost): + + """Hotelling's T-Squared.""" + + model = "t2" + + def __init__(self, n_components=2): + """Create a new instance. + + Args: + n_components (int, optional): number of components to keep in signal in each segment. + + Returns: + self + """ + self.min_size = 2 + self.signal = None + self.n_components = n_components + + def fit(self, signal): + """Set parameters of the instance. + + Args: + signal (array): signal. Shape (n_samples,) or (n_samples, n_features) + + Returns: + self + """ + if signal.ndim == 1 or (signal.ndim == 2 and signal.shape[1] == 1): + raise ValueError("The signal must be multivariate.") + else: + self.signal = signal + + return self + + def error(self, start, end): + """Return the approximation cost on the segment [start:end]. + + Args: + start (int): start of the segment + end (int): end of the segment + + Returns: + float: segment cost + + Raises: + NotEnoughPoints: when the segment is too short (less than ``'min_size'`` samples). + """ + if end - start < self.min_size: + raise NotEnoughPoints + sub = self.signal[start:end] + + pca = PCA(self.n_components) + sub_tr = pca.fit_transform(sub) + + return np.linalg.norm(sub_tr)**2 diff --git a/ruptures/datasets/pw_normal.py b/ruptures/datasets/pw_normal.py index d1c18ae7..939bbc8c 100644 --- a/ruptures/datasets/pw_normal.py +++ b/ruptures/datasets/pw_normal.py @@ -8,6 +8,7 @@ ---------------------------------------------------------------------------------------------------- This function simulates a 2D signal of Gaussian i.i.d. random variables with zero mean and covariance matrix alternating between :math:`[[1, 0.9], [0.9, 1]]` and :math:`[[1, -0.9], [-0.9, 1]]` at every change point. +Users can also append `noisy` dimensions (i.e. without change points). .. figure:: /images/correlation_shift.png :scale: 50 % @@ -47,12 +48,17 @@ from ruptures.utils import draw_bkps -def pw_normal(n_samples=200, n_bkps=3): +def pw_normal(n_samples=200, n_bkps=3, n_noisy_features=0): """Return a 2D piecewise Gaussian signal and the associated changepoints. + Users can also append `noisy` dimensions (i.e. without change points). Args: n_samples (int, optional): signal length n_bkps (int, optional): number of change points + n_noisy_features (int, optional): number of noisy dimensions. If not + zero, total dimensions of the signal will be 2 + `n_noisy_features` + and the last `n_noisy_features` dimensions will be white noise only + (no change point). Returns: tuple: signal of shape (n_samples, 2), list of breakpoints @@ -68,4 +74,8 @@ def pw_normal(n_samples=200, n_bkps=3): n_sub, _ = sub.shape sub += rd.multivariate_normal([0, 0], cov, size=n_sub) + # Add noise + if n_noisy_features > 0: + signal = np.c_[signal, np.random.randn(n_samples, n_noisy_features)] + return signal, bkps diff --git a/setup.py b/setup.py index a8590319..b8fda62d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ name="ruptures", version="1.0.5", packages=find_packages(exclude=["docs", "tests*", "images"]), - install_requires=["numpy", "scipy"], + install_requires=["numpy", "scikit-learn", "scipy"], extras_require={"display": ["matplotlib"]}, python_requires=">=3", # url='ctruong.perso.math.cnrs.fr/ruptures',