Skip to content

Commit dbfa4e1

Browse files
authored
Merge pull request #412 from bsc-wdc/Simulation
Added Simulation Grid Search and its tests
2 parents bd55f6e + eb5e2cf commit dbfa4e1

File tree

12 files changed

+462
-8
lines changed

12 files changed

+462
-8
lines changed

dislib/classification/csvm/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
443443

444444
sync_obj(self.__dict__)
445445
model_metadata = self.__dict__
446-
model_metadata["model_name"] = "kmeans"
446+
model_metadata["model_name"] = "csvm"
447447

448448
# Save model
449449
if save_format == "json":

dislib/classification/knn/base.py

+162
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@
77
from dislib.data.array import Array
88
from dislib.neighbors import NearestNeighbors
99
from sklearn.metrics import accuracy_score
10+
from sklearn.neighbors import NearestNeighbors as SKNeighbors
11+
from sklearn.neighbors import KDTree
1012

1113
from collections import defaultdict
1214

15+
import os
16+
import json
17+
import dislib.data.util.model as utilmodel
18+
import pickle
19+
from dislib.data.util import sync_obj, decoder_helper, encoder_helper
20+
1321

1422
class KNeighborsClassifier(BaseEstimator):
1523
"""Classifier implementing the k-nearest neighbors vote.
@@ -136,6 +144,104 @@ def score(self, q: Array, y: Array, collect=False):
136144

137145
return compss_wait_on(score) if collect else score
138146

147+
def save_model(self, filepath, overwrite=True, save_format="json"):
    """Saves a model to a file.

    The model is synchronized before saving and can be reinstantiated
    in the exact same state, without any of the code used for model
    definition or fitting.

    Parameters
    ----------
    filepath : str
        Path where to save the model
    overwrite : bool, optional (default=True)
        Whether any existing model at the target location should be
        overwritten. When False and `filepath` is an existing file,
        nothing is written and the method returns silently.
    save_format : str, optional (default='json')
        Format used to save the model: 'json', 'cbor' or 'pickle'.

    Raises
    ------
    ModuleNotFoundError
        If `save_format` is 'cbor' and the cbor2 module is unavailable.
    ValueError
        If `save_format` is not one of the supported formats.

    Examples
    --------
    >>> from dislib.classification import KNeighborsClassifier
    >>> import numpy as np
    >>> import dislib as ds
    >>> x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
    >>> y_data = np.array([2, 1, 1, 2, 0])
    >>> x = ds.array(x=x_data, block_size=(2, 2))
    >>> y = ds.array(x=y_data, block_size=(2, 1))
    >>> knn = KNeighborsClassifier()
    >>> knn.fit(x, y)
    >>> knn.save_model("./model_KNN")
    """
    # Respect an existing file when the caller asked not to overwrite it.
    keep_existing = not overwrite and os.path.isfile(filepath)
    if keep_existing:
        return

    # NOTE: the metadata *is* the instance dictionary (no copy is made), so
    # the "model_name" entry also becomes an attribute of this estimator.
    state = self.__dict__
    sync_obj(state)
    state["model_name"] = "knn"

    if save_format == "json":
        with open(filepath, "w") as out_file:
            json.dump(state, out_file, default=_encode_helper)
        return

    if save_format == "cbor":
        if utilmodel.cbor2 is None:
            raise ModuleNotFoundError("No module named 'cbor2'")
        with open(filepath, "wb") as out_file:
            utilmodel.cbor2.dump(state, out_file,
                                 default=_encode_helper_cbor)
        return

    if save_format == "pickle":
        with open(filepath, "wb") as out_file:
            pickle.dump(state, out_file)
        return

    raise ValueError("Wrong save format.")
197+
198+
def load_model(self, filepath, load_format="json"):
    """Loads a model from a file.

    The model is reinstantiated in the exact same state in which it was
    saved, without any of the code used for model definition or fitting.
    Every entry of the stored metadata is set as an attribute of this
    estimator.

    Parameters
    ----------
    filepath : str
        Path of the saved model
    load_format : str, optional (default='json')
        Format used to load the model: 'json', 'cbor' or 'pickle'.

    Raises
    ------
    ModuleNotFoundError
        If `load_format` is 'cbor' and the cbor2 module is unavailable.
    ValueError
        If `load_format` is not one of the supported formats.

    Examples
    --------
    >>> from dislib.classification import KNeighborsClassifier
    >>> import numpy as np
    >>> import dislib as ds
    >>> x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
    >>> y_data = np.array([2, 1, 1, 2, 0])
    >>> x_test_m = np.array([[3, 2], [4, 4], [1, 3]])
    >>> bn, bm = 2, 2
    >>> x = ds.array(x=x_data, block_size=(bn, bm))
    >>> y = ds.array(x=y_data, block_size=(bn, 1))
    >>> test_data_m = ds.array(x=x_test_m, block_size=(bn, bm))
    >>> knn = KNeighborsClassifier()
    >>> knn.fit(x, y)
    >>> knn.save_model("./model_KNN")
    >>> knn_loaded = KNeighborsClassifier()
    >>> knn_loaded.load_model("./model_KNN")
    >>> pred = knn_loaded.predict(test_data_m).collect()
    """
    if load_format == "json":
        with open(filepath, "r") as in_file:
            restored = json.load(in_file, object_hook=_decode_helper)
    elif load_format == "cbor":
        if utilmodel.cbor2 is None:
            raise ModuleNotFoundError("No module named 'cbor2'")
        with open(filepath, "rb") as in_file:
            restored = utilmodel.cbor2.load(
                in_file, object_hook=_decode_helper_cbor)
    elif load_format == "pickle":
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(filepath, "rb") as in_file:
            restored = pickle.load(in_file)
    else:
        raise ValueError("Wrong load format.")

    # Restore every stored entry as an attribute of this estimator.
    for attr_name, attr_value in restored.items():
        setattr(self, attr_name, attr_value)
244+
139245

140246
@constraint(computing_units="${ComputingUnits}")
141247
@task(ind_blocks={Type: COLLECTION_IN, Depth: 2},
@@ -180,3 +286,59 @@ def _get_score(y_blocks, ypred_blocks):
180286
y_pred = Array._merge_blocks(ypred_blocks).flatten()
181287

182288
return accuracy_score(y, y_pred)
289+
290+
291+
def _decode_helper_cbor(decoder, obj):
    """Special decoder wrapper for dislib using cbor2.

    The `decoder` argument is accepted to match the two-argument hook
    signature but is not used; decoding is delegated to `_decode_helper`.
    """
    decoded = _decode_helper(obj)
    return decoded
294+
295+
296+
def _decode_helper(obj):
    """Rebuild an object from its encoded dictionary form.

    Dictionaries carrying a "class_name" entry are turned back into the
    corresponding object; anything else is returned unchanged.
    """
    is_tagged = isinstance(obj, dict) and "class_name" in obj
    if not is_tagged:
        return obj

    class_name = obj["class_name"]

    if class_name == "NearestNeighbors":
        neighbors = NearestNeighbors(obj["n_neighbors"])
        neighbors.__setstate__(_decode_helper(obj["items"]))
        return neighbors

    if class_name == "SKNeighbors":
        state = _decode_helper(obj["items"])
        sk_model = SKNeighbors()
        sk_model.__setstate__(state)
        return sk_model

    if class_name == "KDTree":
        state = _decode_helper(obj["items"])
        # The tree is rebuilt from the first element of the stored state
        # (presumably the data array -- verify against KDTree.__getstate__).
        return KDTree(state[0])

    # Any other class is delegated to the shared dislib decoder; when that
    # one does not recognise it either, the raw dictionary is returned.
    decoded = decoder_helper(class_name, obj)
    return obj if decoded is None else decoded
317+
318+
319+
def _encode_helper_cbor(encoder, obj):
    """Special encoder wrapper for dislib using cbor2.

    Converts `obj` with `_encode_helper` and hands the result back to
    `encoder` to be written.
    """
    encodable = _encode_helper(obj)
    encoder.encode(encodable)
321+
322+
323+
def _encode_helper(obj):
    """Convert an object into a dictionary tagged with its class name.

    The shared dislib `encoder_helper` is tried first. Otherwise the
    scikit-learn nearest neighbors model, `KDTree` and the dislib
    `NearestNeighbors` are encoded as a dictionary holding a "class_name"
    entry and the object state under "items".

    Returns None when the object is not recognised.
    """
    generic = encoder_helper(obj)
    if generic is not None:
        return generic

    if isinstance(obj, SKNeighbors):
        return {
            "class_name": "SKNeighbors",
            "n_neighbors": obj.n_neighbors,
            "radius": obj.radius,
            "items": obj.__getstate__(),
        }

    if isinstance(obj, KDTree):
        return {
            "class_name": "KDTree",
            "items": obj.__getstate__(),
        }

    if isinstance(obj, NearestNeighbors):
        return {
            "class_name": obj.__class__.__name__,
            "n_neighbors": obj.n_neighbors,
            "items": obj.__getstate__(),
        }

    # NOTE(review): unrecognised objects yield None, so when this function
    # is used as a `default` hook they are stored as null instead of
    # raising -- confirm this is intended before changing it.
    return None

dislib/cluster/gm/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
589589

590590
sync_obj(self.__dict__)
591591
model_metadata = self.__dict__
592-
model_metadata["model_name"] = "kmeans"
592+
model_metadata["model_name"] = "gm"
593593

594594
# Save model
595595
if save_format == "json":

dislib/model_selection/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from dislib.model_selection._search import GridSearchCV, RandomizedSearchCV
2+
from dislib.model_selection._simulation import SimulationGridSearch
23
from dislib.model_selection._split import KFold
34

4-
__all__ = ['GridSearchCV', 'RandomizedSearchCV', 'KFold']
5+
__all__ = ['GridSearchCV', 'RandomizedSearchCV', 'KFold',
6+
'SimulationGridSearch']

dislib/model_selection/_simulation.py

+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
from pycompss.api.api import compss_wait_on
2+
from sklearn.model_selection import ParameterGrid
3+
from dislib.model_selection._validation import simulation_execution
4+
from collections import defaultdict
5+
from functools import partial
6+
from numpy.ma import MaskedArray
7+
from scipy.stats import rankdata
8+
import numpy as np
9+
10+
11+
class SimulationGridSearch():
    """Exhaustive search over a parameter grid for a callable simulation.

    Every parameter combination generated from `param_grid` is executed
    `sim_number` times and the candidates are ranked by the mean of the
    values obtained.

    Parameters
    ----------
    estimator : callable
        Simulation to execute. It is called with the candidate parameters
        and the extra parameters given to `fit` as keyword arguments.
    param_grid : dict or list of dict
        Grid specification, forwarded unchanged to
        `sklearn.model_selection.ParameterGrid`.
    sim_number : int, optional (default=1)
        Number of times each candidate is executed.
    order : str, optional (default="max")
        With "max" the candidate with the highest mean result is ranked
        first; with any other value the lowest mean is ranked first.

    Attributes
    ----------
    raw_results : list or None
        Results of every execution, one entry per candidate. None until
        `fit` is called.
    cv_results_ : dict or None
        Formatted results (see `_format_results`). None until `fit` is
        called.
    best_index_ : int
        Index of the best ranked candidate. Only set by `fit`.
    best_score_ : float
        Mean result of the best ranked candidate. Only set by `fit`.
    best_params_ : dict
        Parameters of the best ranked candidate. Only set by `fit`.
    """

    def __init__(self, estimator, param_grid,
                 sim_number=1, order="max"):
        self.estimator = estimator
        self.param_grid = param_grid
        self.sim_number = sim_number
        self.order = order
        self.raw_results = None
        self.cv_results_ = None

    def _run_search(self, evaluate_candidates):
        """Evaluate the whole parameter grid.

        Parameters
        ----------
        evaluate_candidates : callable
            Function that receives an iterable of candidate parameter
            dictionaries and evaluates all of them.
        """
        evaluate_candidates(ParameterGrid(self.param_grid))

    def fit(self, x, y=None, **fit_params):
        """Run the simulation with all sets of parameters.

        Parameters
        ----------
        x : ds-array
            Not used by this method.
            NOTE(review): presumably kept for compatibility with the other
            search classes -- confirm.
        y : ds-array, optional (default = None)
            Not used by this method.
        **fit_params : dict of string -> object
            Extra keyword arguments passed to every execution of the
            simulation.

        Returns
        -------
        self : SimulationGridSearch

        Raises
        ------
        NotImplementedError
            If the estimator is not callable.
        """
        estimator = self.estimator
        if not callable(estimator):
            raise NotImplementedError("The simulation needs to "
                                      "be contained on a function")

        all_candidate_params = []
        all_out = []

        def evaluate_candidates_simulation(candidate_params):
            candidate_params = list(candidate_params)
            # Launch every candidate before waiting on any result.
            fits = [simulation_execution(
                        estimator, parameters=parameters,
                        simulation_params=fit_params,
                        number_simulations=self.sim_number)
                    for parameters in candidate_params]
            all_candidate_params.extend(candidate_params)
            all_out.extend(compss_wait_on(fits))

        self._run_search(evaluate_candidates_simulation)

        self.raw_results = all_out
        results = self._format_results(all_candidate_params, all_out,
                                       order=self.order,
                                       sim_number=self.sim_number)

        # Rank 1 is the best candidate, hence the position of the minimum.
        self.best_index_ = results["rank_test_simulation"].argmin()
        self.best_score_ = results["mean_test_simulation"][self.best_index_]
        self.best_params_ = results["params"][self.best_index_]

        self.cv_results_ = results

        return self

    @staticmethod
    def _format_results(candidate_params, out, order="max", sim_number=1):
        """Arrange the raw results in a dictionary of columns.

        Parameters
        ----------
        candidate_params : list of dict
            Parameters of every candidate.
        out : list
            Results of the executions; it must be convertible to a float
            array of shape (number of candidates, sim_number).
        order : str, optional (default="max")
            With "max" higher means get the better (lower) rank; with any
            other value lower means get the better rank.
        sim_number : int, optional (default=1)
            Number of executions per candidate.

        Returns
        -------
        results : dict
            Contains one masked array "param_<name>" per parameter, the
            list "params", one array "results_<i>_test_simulation" per
            execution, and the arrays "mean_test_simulation",
            "std_test_simulation" and "rank_test_simulation".
        """
        n_candidates = len(candidate_params)
        results = {}

        def _store(key_name, array, rank=False):
            """Store the per-execution values, mean, std and rank."""
            array = np.array(array, dtype=np.float64).reshape(
                n_candidates, sim_number)
            for i in range(sim_number):
                results["results_%d_%s" % (i, key_name)] = array[:, i]
            array_means = np.mean(array, axis=1)
            results['mean_%s' % key_name] = array_means
            results['std_%s' % key_name] = np.std(array, axis=1)
            if rank:
                # Ranks are ascending, so negate the means when the
                # highest value has to be ranked first.
                to_rank = -array_means if order == "max" else array_means
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(to_rank, method='min'), dtype=np.int32)

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates, ),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params
        _store('test_simulation', out, rank=True)
        return results

dislib/model_selection/_validation.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dislib.data.array import Array
44
from pycompss.api.task import task
55
from pycompss.api.parameter import INOUT, Depth, Type, COLLECTION_IN
6-
6+
import sys
77
import numpy as np
88

99

@@ -39,6 +39,24 @@ def score_sklearn_estimator(est, scorer, blocks_x, blocks_y):
3939
return _score(est, x, y, scorer)
4040

4141

42+
def execute_simulation(simulation, **parameters):
    """Run `simulation` with the given keyword parameters.

    The word "PARAMETERS" followed by the name of every parameter is
    written to the standard output before the call.

    Returns
    -------
    The value returned by `simulation(**parameters)`.
    """
    emit = sys.stdout.write
    emit("PARAMETERS")
    for name in parameters.keys():
        emit(str(name))
    outcome = simulation(**parameters)
    return outcome
47+
48+
49+
def simulation_execution(simulation, parameters,
                         simulation_params, number_simulations):
    """Execute a simulation several times with the same parameters.

    Parameters
    ----------
    simulation : callable
        Simulation to execute.
    parameters : dict or None
        Candidate parameters. When None nothing is executed.
    simulation_params : dict
        Additional keyword arguments passed to every execution.
    number_simulations : int
        Number of executions to perform.

    Returns
    -------
    list
        One result per execution; empty when `parameters` is None.
    """
    if parameters is None:
        return []
    return [execute_simulation(simulation,
                               **parameters,
                               **simulation_params)
            for _ in range(number_simulations)]
58+
59+
4260
def sklearn_fit(estimator, train_ds,
4361
parameters, fit_params):
4462
if parameters is not None:

dislib/neighbors/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def fit(self, x):
5151
for row in x._iterator(axis=0):
5252
sknnstruct = _compute_fit(row._blocks)
5353
n_samples = row.shape[0]
54-
self._fit_data.append((sknnstruct, n_samples))
54+
self._fit_data.append([sknnstruct, n_samples])
5555

5656
return self
5757

dislib/recommendation/als/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
252252

253253
sync_obj(self.__dict__)
254254
model_metadata = self.__dict__
255-
model_metadata["model_name"] = "kmeans"
255+
model_metadata["model_name"] = "als"
256256

257257
# Save model
258258
if save_format == "json":

dislib/regression/lasso/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
165165

166166
sync_obj(self.__dict__)
167167
model_metadata = self.__dict__
168-
model_metadata["model_name"] = "kmeans"
168+
model_metadata["model_name"] = "lasso"
169169

170170
# Save model
171171
if save_format == "json":

dislib/regression/linear/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
168168

169169
sync_obj(self.__dict__)
170170
model_metadata = self.__dict__
171-
model_metadata["model_name"] = "kmeans"
171+
model_metadata["model_name"] = "linear"
172172

173173
# Save model
174174
if save_format == "json":

0 commit comments

Comments
 (0)