Skip to content

Commit 5b30669

Browse files
authored
Merge pull request #200 from kinisi-dev/reconditioning
Add new matrix reconditioning approach.
2 parents f530678 + 419c3dd commit 5b30669

4 files changed

Lines changed: 144 additions & 23 deletions

File tree

docs/source/condition_number.ipynb

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,58 @@
101101
"But unfortunately, it is not always possible to achieve a low enough condition number. \n",
102102
"Therefore, we need to find a different solution. \n",
103103
"\n",
104-
"So far, the best solution that we have found has been to change the time intervals, `dt`, that is used.\n",
104+
"So far, the best solution that we have found has been to recondition the matrix in an effort to reduce the noise. \n",
105+
"We are working on a full assessment of the effectiveness of this approach and hope to have a preprint on the subject soon. \n",
106+
"To use the reconditioning, run: "
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": null,
112+
"metadata": {},
113+
"outputs": [],
114+
"source": [
115+
"diff.diffusion(1.5 * sc.Unit('ps'), recondition=True)\n",
116+
"\n",
117+
"fig, ax = plt.subplots()\n",
118+
"ax.plot(diff.dt.values, diff.msd.values, 'k-')\n",
119+
"for i, ci in enumerate(credible_intervals):\n",
120+
" ax.fill_between(diff.dt.values,\n",
121+
" *np.percentile(diff.distributions, ci, axis=1),\n",
122+
" alpha=alpha[i],\n",
123+
" color='#0173B2',\n",
124+
" lw=0)\n",
125+
"ax.set_xlabel(f'Time / {diff.dt.unit}')\n",
126+
"ax.set_ylabel(f'MSD / {diff.msd.unit}')\n",
127+
"ax.set_xlim(0, None)\n",
128+
"ax.set_ylim(0, None)\n",
129+
"plt.show()"
130+
]
131+
},
132+
{
133+
"cell_type": "markdown",
134+
"metadata": {},
135+
"source": [
136+
"You will notice that this can significantly reduce the condition number. "
137+
]
138+
},
139+
{
140+
"cell_type": "code",
141+
"execution_count": null,
142+
"metadata": {},
143+
"outputs": [],
144+
"source": [
145+
"np.linalg.cond(diff.diff.covariance_matrix.values)"
146+
]
147+
},
148+
{
149+
"cell_type": "markdown",
150+
"metadata": {},
151+
"source": [
152+
"However, this is not always successful. \n",
153+
"The next approach to resolve this issue is to change the time intervals, `dt`, that are used. \n",
105154
"Because we estimate the full covariance matrix, within reason, computing the MSD at every possible time interval is usually overkill. \n",
106-
"Instead, we can use a longer time interval sets, currently the time intervals are 0.02 ps apart. "
155+
"Instead, we can use a longer time interval set; currently, the time intervals are 0.02 ps apart."
107156
]
108157
},
109158
{
@@ -188,7 +237,7 @@
188237
"cell_type": "markdown",
189238
"metadata": {},
190239
"source": [
191-
"We welcome any user feedback on this problem, i.e., if you have a better why to mitigate it or remove it completely. "
240+
"We welcome any user feedback on this problem, i.e., if you have a better way to mitigate it or remove it completely. "
192241
]
193242
}
194243
],

kinisi/diffusion.py

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111
from emcee import EnsembleSampler
1212
from scipp.constants import k
1313
from scipy.linalg import pinvh
14-
from scipy.optimize import minimize
15-
from scipy.stats import linregress, multivariate_normal
14+
from scipy.optimize import curve_fit
15+
from scipy.stats import gaussian_kde, linregress, multivariate_normal
1616
from statsmodels.stats.correlation_tools import cov_nearest
17+
from statsmodels.stats.moment_helpers import corr2cov, cov2corr
1718
from tqdm import tqdm
1819

1920
from kinisi import __version__
@@ -93,6 +94,7 @@ def bayesian_regression(
9394
self,
9495
start_dt: sc.Variable,
9596
cond_max: float = 1e16,
97+
recondition: bool = False,
9698
fit_intercept: bool = True,
9799
n_samples: int = 1000,
98100
n_walkers: int = 32,
@@ -106,6 +108,7 @@ def bayesian_regression(
106108
107109
:param start_dt: The time at which the diffusion regime begins.
108110
:param cond_max: The maximum condition number of the covariance matrix. Optional, default is :py:attr:`1e16`.
111+
:param recondition: Whether to recondition the covariance matrix. Optional, default is :py:attr:`False`.
109112
:param fit_intercept: Whether to fit an intercept. Optional, default is :py:attr:`True`.
110113
:param n_samples: The number of MCMC samples to take. Optional, default is :py:attr:`1000`.
111114
:param n_walkers: The number of walkers to use in the MCMC. Optional, default is :py:attr:`32`.
@@ -119,6 +122,7 @@ def bayesian_regression(
119122

120123
self._start_dt = start_dt
121124
self._cond_max = cond_max
125+
self._recondition = recondition
122126

123127
self.diff_regime = np.argwhere(self.dg['da'].coords['time interval'] >= self._start_dt)[0][0]
124128
self._covariance_matrix = self.compute_covariance_matrix()
@@ -149,17 +153,7 @@ def log_likelihood(theta: np.ndarray) -> float:
149153
if slope < 0:
150154
slope = 1e-20
151155

152-
def nll(*args) -> float:
153-
"""
154-
General purpose negative log-likelihood.
155-
:return: Negative log-likelihood
156-
"""
157-
return -log_likelihood(*args)
158-
159-
if fit_intercept:
160-
max_likelihood = minimize(nll, np.array([slope, intercept])).x
161-
else:
162-
max_likelihood = minimize(nll, np.array([slope])).x
156+
max_likelihood = np.array([slope, intercept])
163157

164158
pos = max_likelihood + max_likelihood * 1e-3 * np.random.randn(n_walkers, max_likelihood.size)
165159
sampler = EnsembleSampler(*pos.shape, log_likelihood)
@@ -169,7 +163,6 @@ def nll(*args) -> float:
169163
# sampler._random = random_state
170164
sampler.run_mcmc(pos, n_samples + n_burn, progress=progress, progress_kwargs={'desc': 'Likelihood Sampling'})
171165
self._flatchain = sampler.get_chain(flat=True, thin=n_thin, discard=n_burn)
172-
173166
self.gradient = Samples(
174167
self._flatchain[:, 0], unit=(self.dg['da'].unit / self.dg['da'].coords['time interval'].unit)
175168
)
@@ -258,11 +251,20 @@ def compute_covariance_matrix(self) -> sc.Variable:
258251
value = ratio * self.dg['da'].data.variances[i]
259252
cov[i, j] = value
260253
cov[j, i] = np.copy(cov[i, j])
261-
return sc.array(
262-
dims=['time_interval1', 'time_interval2'],
263-
values=cov_nearest(minimum_eigenvalue_method(cov[self.diff_regime :, self.diff_regime :], self._cond_max)),
264-
unit=self.dg['da'].unit ** 2,
265-
)
254+
if self._recondition:
255+
return sc.array(
256+
dims=['time_interval1', 'time_interval2'],
257+
values=cov_nearest(eigenvalue_clipping(cov_nearest(cov[self.diff_regime :, self.diff_regime :]))),
258+
unit=self.dg['da'].unit ** 2,
259+
)
260+
else:
261+
return sc.array(
262+
dims=['time_interval1', 'time_interval2'],
263+
values=cov_nearest(
264+
minimum_eigenvalue_method(cov[self.diff_regime :, self.diff_regime :], self._cond_max)
265+
),
266+
unit=self.dg['da'].unit ** 2,
267+
)
266268

267269
def posterior_predictive(
268270
self, n_posterior_samples: int = None, n_predictive_samples: int = 256, progress: bool = True
@@ -338,6 +340,62 @@ def minimum_eigenvalue_method(cov: np.ndarray, cond_max=1e16) -> np.ndarray:
338340
return new_cov
339341

340342

343+
def eigenvalue_clipping(cov: np.ndarray) -> np.ndarray:
    """
    Eigenvalue clipping method for matrix reconditioning.

    The covariance matrix is converted to a correlation matrix, and the density
    of its eigenvalues is fitted with a Marchenko-Pastur distribution. The
    eigenvalues below the fitted upper edge of that distribution are treated as
    noise and clipped to the largest eigenvalue in the noise region, after which
    the matrix is rebuilt from the modified spectrum.

    :param cov: Covariance matrix to recondition.

    :return: Reconditioned covariance matrix.
    """
    corr = cov2corr(cov)
    eigenthings = np.linalg.eig(corr)
    # A correlation matrix is symmetric, so any imaginary parts are numerical noise.
    eigenvalues = eigenthings.eigenvalues.real

    # Estimate the eigenvalue density and fit a Marchenko-Pastur form to it.
    kde = gaussian_kde(eigenvalues)
    x = np.linspace(eigenvalues.min() - 0.5 * eigenvalues.max(), eigenvalues.max() + 0.5 * eigenvalues.max(), 10000)
    popt, _ = curve_fit(marchenkopastur, x, kde.pdf(x), bounds=([0, 0], [np.inf, np.inf]), p0=[0.5, 1.0])

    # Upper edge of the fitted noise distribution; eigenvalues below it are noise.
    lambda_plus = (1 + popt[0] ** 0.5) ** 2

    # Clip every noise eigenvalue to the largest eigenvalue inside the noise region.
    # (A dead assignment of the analytic lower edge was removed here — it was
    # immediately overwritten by this empirical value.)
    # NOTE(review): assumes at least one eigenvalue lies below lambda_plus;
    # np.max over an empty selection would raise — confirm against expected inputs.
    lambda_clip = np.max(eigenvalues[eigenvalues < lambda_plus])
    new_eigenvalues = np.copy(eigenvalues)
    new_eigenvalues[new_eigenvalues < lambda_plus] = lambda_clip

    # Rebuild the correlation matrix from the clipped spectrum and renormalise
    # its diagonal back to unity.
    new_corr = (eigenthings.eigenvectors @ np.diag(new_eigenvalues) @ np.linalg.inv(eigenthings.eigenvectors)).real
    S = np.diag(1 / (np.diag(new_corr)) ** 0.5)
    new_corr = S @ new_corr @ S.T

    # Restore the original variances on the diagonal.
    new_cov = corr2cov(new_corr, np.sqrt(cov.diagonal()))
    return new_cov
373+
374+
375+
def marchenkopastur(x: np.ndarray, lambda_: float, sigma: float) -> np.ndarray:
    """
    Evaluate the Marchenko-Pastur probability density.

    :param x: Points at which to evaluate the distribution.
    :param lambda_: The lambda (ratio) parameter of the distribution.
    :param sigma: Standard deviation of the distribution.
    :return: Density values at each point of :py:attr:`x`; zero outside the
        support :math:`[\\lambda_-, \\lambda_+]`.
    """
    sqrt_lambda = lambda_**0.5
    upper_edge = (1 + sqrt_lambda) ** 2
    lower_edge = (1 - sqrt_lambda) ** 2

    # Clamp both support terms at zero so the density vanishes outside
    # the interval [lower_edge, upper_edge].
    support = np.maximum(upper_edge - x, 0.0) * np.maximum(x - lower_edge, 0.0)
    return np.sqrt(support) / (2 * np.pi * sigma**2 * lambda_ * x)
397+
398+
341399
def _straight_line(abscissa: np.ndarray, gradient: float, intercept: float = 0.0) -> np.ndarray:
342400
"""
343401
A one dimensional straight line function.

kinisi/diffusion_analyzer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def diffusion(
223223
self,
224224
start_dt: VariableLikeType,
225225
cond_max: float = 1e16,
226+
recondition: bool = False,
226227
fit_intercept: bool = True,
227228
n_samples: int = 1000,
228229
n_walkers: int = 32,
@@ -236,6 +237,7 @@ def diffusion(
236237
237238
:param start_dt: The time at which the diffusion regime begins.
238239
:param cond_max: The maximum condition number of the covariance matrix. Optional, default is :py:attr:`1e16`.
240+
:param recondition: Whether to recondition the covariance matrix. Optional, default is :py:attr:`False`.
239241
:param fit_intercept: Whether to fit an intercept. Optional, default is :py:attr:`True`.
240242
:param n_samples: The number of MCMC samples to take. Optional, default is :py:attr:`1000`.
241243
:param n_walkers: The number of walkers to use in the MCMC. Optional, default is :py:attr:`32`.
@@ -248,6 +250,7 @@ def diffusion(
248250
self.diff._diffusion(
249251
start_dt,
250252
cond_max=cond_max,
253+
recondition=recondition,
251254
fit_intercept=fit_intercept,
252255
n_samples=n_samples,
253256
n_walkers=n_walkers,

kinisi/tests/test_diffusion.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import pytest
1313
import scipp as sc
1414

15-
from kinisi.diffusion import Diffusion, _straight_line, minimum_eigenvalue_method
15+
from kinisi.diffusion import Diffusion, _straight_line, eigenvalue_clipping, marchenkopastur, minimum_eigenvalue_method
1616
from kinisi.tests import TEST_FILE_PATH
1717

1818
# Random seed setting not yet implemented into bayesian regression and so cannot almost_equal
@@ -36,6 +36,17 @@ def test_straight_line(self):
3636
expected_result = np.array([4.3, 7.3, 10.3])
3737
assert np.all(result == expected_result)
3838

39+
def test_eigenvalue_clipping(self):
    """Reconditioning should change the values of the input matrix."""
    original = np.random.random((100, 100)) + 100
    reconditioned = eigenvalue_clipping(original)
    assert not np.allclose(original, reconditioned)
43+
44+
def test_marchenkopastur(self):
    """Check the Marchenko-Pastur density against reference values."""
    points = np.linspace(1, 11, 10)
    expected = np.array([0.03978874, 0.02530364, 0.01740905, 0.01145199, 0.00519943, 0.0, 0.0, 0.0, 0.0, 0.0])
    assert np.allclose(marchenkopastur(points, 2, 2), expected)
49+
3950

4051
class TestDiffusion(unittest.TestCase):
4152
@classmethod

0 commit comments

Comments
 (0)