99"""
1010
1111import os
12+ import warnings
13+
1214import numpy as np
13- from sklearn .metrics import pairwise_distances
1415from sklearn .base import BaseEstimator , ClusterMixin
15- from sklearn .utils .validation import check_array , validate_data , check_random_state
16+ from sklearn .metrics import pairwise_distances
17+ from sklearn .utils .validation import check_random_state , validate_data
1618
1719from radius_clustering .utils ._emos import py_emos_main
1820from radius_clustering .utils ._mds_approx import solve_mds
2123
2224
2325class RadiusClustering (ClusterMixin , BaseEstimator ):
24- """
26+ r """
2527 Radius Clustering algorithm.
2628
2729 This class implements clustering based on the Minimum Dominating Set (MDS) problem.
@@ -46,29 +48,52 @@ class RadiusClustering(ClusterMixin, BaseEstimator):
4648 The maximum distance between any point and its assigned cluster center.
4749 random_state\_ : int | None
4850 The random state used for reproducibility. If None, no random state is set.
49-
51+
5052 .. note::
5153 The `random_state_` attribute is not used when the `manner` is set to "exact".
52-
54+
5355 .. versionadded:: 1.3.0
54- The *random_state* parameter was added to allow reproducibility in the approximate method.
56+ The *random_state* parameter was added to allow reproducibility in
57+ the approximate method.
5558
5659 .. versionchanged:: 1.3.0
57- All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`).
60+ All publicly accessible attributes are now suffixed with an underscore
61+ (e.g., `centers_`, `labels_`).
5862 This is particularly useful for compatibility with scikit-learn's API.
59-
60- .. versionchanged:: 1.3.0
61- The `threshold` parameter was renamed to `radius` to better reflect its purpose.
63+
64+ .. versionadded:: 1.3.0
65+ The `radius` parameter replaces the `threshold` parameter for setting
66+ the dissimilarity threshold for better clarity and consistency.
67+
68+ .. deprecated:: 1.3.0
69+ The `threshold` parameter is deprecated. Use `radius` instead.
70+ Will be removed in a future version.
6271 """
6372
6473 _estimator_type = "clusterer"
6574
def __init__(
    self,
    manner: str = "approx",
    radius: float = 0.5,
    threshold: float | None = None,
    random_state: int | None = None,
) -> None:
    """
    Store the hyper-parameters of the estimator.

    Parameters:
    -----------
    manner : str
        Stored unchanged on ``self.manner``.
    radius : float
        Stored on ``self.radius`` unless ``threshold`` is given.
    threshold : float | None
        Deprecated alias of ``radius``. When it is not None, a
        ``DeprecationWarning`` is emitted and its value overrides ``radius``.
    random_state : int | None
        Stored unchanged on ``self.random_state``.
    """
    if threshold is not None:
        # Adjacent string literals are concatenated verbatim, so every
        # continuation fragment must carry its own leading space.
        warnings.warn(
            "The 'threshold' parameter is deprecated and"
            " will be removed in a future version."
            " Please use 'radius' instead.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )
        # The deprecated alias takes precedence over `radius` when supplied.
        radius = threshold
    self.threshold = threshold  # For backward compatibility
    self.manner = manner
    self.radius = radius
    self.random_state = random_state
7095
71- def _check_symmetric (self , a : np .ndarray , tol : float = 1e-8 ) -> bool :
96+ def _check_symmetric (self , a : np .ndarray , tol : float = 1e-8 ) -> bool :
7297 if a .ndim != 2 :
7398 raise ValueError ("Input must be a 2D array." )
7499 if a .shape [0 ] != a .shape [1 ]:
@@ -80,21 +105,26 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
80105 Fit the MDS clustering model to the input data.
81106
82107 This method computes the distance matrix if the input is a feature matrix,
83- or uses the provided distance matrix directly if the input is already a distance matrix.
108+ or uses the provided distance matrix directly if the input is already
109+ a distance matrix.
84110
85111 .. note::
86112 If the input is a distance matrix, it should be symmetric and square.
87- If the input is a feature matrix, the distance matrix will be computed using Euclidean distance.
88-
113+ If the input is a feature matrix, the distance matrix
114+ will be computed using Euclidean distance.
115+
89116 .. tip::
90- Next version will support providing different metrics or even custom callables to compute the distance matrix.
117+ Next version will support providing different metrics or
118+ even custom callables to compute the distance matrix.
91119
92120 Parameters:
93121 -----------
94122 X : array-like, shape (n_samples, n_features)
95- The input data to cluster. X should be a 2D array-like structure. It can either be :
123+ The input data to cluster. X should be a 2D array-like structure.
124+ It can either be :
96125 - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
97- - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
126+ - A feature matrix with shape (n_samples, n_features)
127+ where the distance matrix will be computed.
98128 y : Ignored
99129 Not used, present here for API consistency by convention.
100130
@@ -128,7 +158,7 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
128158 dist_mat = pairwise_distances (self .X_checked_ , metric = "euclidean" )
129159 else :
130160 dist_mat = self .X_checked_
131-
161+
132162 if not isinstance (self .radius , (float , int )):
133163 raise ValueError ("Radius must be a positive float." )
134164 if self .radius <= 0 :
@@ -141,7 +171,9 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
141171 self .effective_radius_ = 0
142172 self .mds_exec_time_ = 0
143173 return self
144- self .edges_ = np .argwhere (adj_mask ).astype (np .uint32 ) # Edges in the adjacency matrix
174+ self .edges_ = np .argwhere (adj_mask ).astype (
175+ np .uint32
176+ ) # Edges in the adjacency matrix
145177 # uint32 is used to use less memory. Max number of features is 2^32-1
146178 self .dist_mat_ = dist_mat
147179
@@ -160,9 +192,11 @@ def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray:
160192 Parameters:
161193 -----------
162194 X : array-like, shape (n_samples, n_features)
163- The input data to cluster. X should be a 2D array-like structure. It can either be :
195+ The input data to cluster. X should be a 2D array-like structure.
196+ It can either be :
164197 - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
165- - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
198+ - A feature matrix with shape (n_samples, n_features) where
199+ the distance matrix will be computed.
166200 y : Ignored
167201 Not used, present here for API consistency by convention.
168202
@@ -181,9 +215,7 @@ def _clustering(self):
181215 n = self .X_checked_ .shape [0 ]
182216 if self .manner != "exact" and self .manner != "approx" :
183217 print (f"Invalid manner: { self .manner } . Defaulting to 'approx'." )
184- raise ValueError (
185- "Invalid manner. Choose either 'exact' or 'approx'."
186- )
218+ raise ValueError ("Invalid manner. Choose either 'exact' or 'approx'." )
187219 if self .manner == "exact" :
188220 self ._clustering_exact (n )
189221 else :
@@ -210,20 +242,27 @@ def _clustering_exact(self, n: int) -> None:
210242
211243 def _clustering_approx (self , n : int ) -> None :
212244 """
213- Perform approximate MDS clustering. This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver.
245+ Perform approximate MDS clustering.
246+ This method uses a pretty trick to set the seed for
247+ the random state of the C++ code of the MDS solver.
214248
215249 .. tip::
216- The random state is used to ensure reproducibility of the results when using the approximate method.
250+ The random state is used to ensure reproducibility of the results
251+ when using the approximate method.
217252 If `random_state` is None, a default value of 42 is used.
218-
253+
219254 .. important::
220255 :collapsible: closed
221256 The trick to set the random state is :
222- 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`.
223- 2. Use the `randint` method of the `RandomState` instance to generate a random integer.
257+ 1. Use the `check_random_state` function to get a `RandomState` singleton
258+ instance, set up with the provided `random_state`.
259+ 2. Use the `randint` method of the `RandomState` instance to generate a
260+ random integer.
224261 3. Use this random integer as the seed for the C++ code of the MDS solver.
225262
226- This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results.
263+ This ensures that the seed passed to the C++ code is always an integer,
264+ which is required by the MDS solver, and allows for
265+ reproducibility of the results.
227266
228267 Parameters:
229268 -----------
@@ -239,7 +278,9 @@ def _clustering_approx(self, n: int) -> None:
239278 self .random_state = 42
240279 self .random_state_ = check_random_state (self .random_state )
241280 seed = self .random_state_ .randint (np .iinfo (np .int32 ).max )
242- result = solve_mds (n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed )
281+ result = solve_mds (
282+ n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed
283+ )
243284 self .centers_ = sorted ([x for x in result ["solution_set" ]])
244285 self .mds_exec_time_ = result ["Time" ]
245286
0 commit comments