From 9048f43f168ddee79e6827c95a7f6be9771a84c7 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 12:16:39 -0400
Subject: [PATCH 1/6] sketch out diverse k-means implementation

---
 modAL/batch.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 38fa732..5d8b2bf 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -6,11 +6,12 @@
 import numpy as np
 import scipy.sparse as sp
+from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min
 
 from modAL.utils.data import data_vstack, modALinput, data_shape
 from modAL.models.base import BaseCommittee, BaseLearner
-from modAL.uncertainty import classifier_uncertainty
+from modAL.uncertainty import classifier_margin, classifier_uncertainty
 
 
 def select_cold_start_instance(X: modALinput,
@@ -216,3 +217,84 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
     return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                         n_instances=n_instances, metric=metric, n_jobs=n_jobs)
+
+
+def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
+                 unlabeled: modALinput,
+                 uncertainty_scores: np.ndarray,
+                 n_instances: int,
+                 n_jobs: Union[int, None]) -> np.ndarray:
+    """
+    Query the top :n_instances: records to request labels for.
+
+    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
+    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        unlabeled: Set of records to be considered for our active learning model.
+        uncertainty_scores: Our classifier's predictions over the response variable.
+        n_instances: Limit on the number of records to query from our unlabeled set.
+        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+
+    Returns:
+        The indices of the top n_instances ranked unlabelled samples.
+    """
+    # Make a local copy of our classifier's training data.
+    # Define our record container and record the best cold start instance in the case of cold start.
+
+    # transform unlabeled data if needed
+    if classifier.on_transformed:
+        unlabeled = classifier.transform_without_estimating(unlabeled)
+
+    if classifier.X_training is None:
+        # TODO: Random or diversity-based?
+        return
+
+    kmeans = KMeans(n_clusters=n_instances)
+    kmeans.fit(unlabeled, sample_weight=uncertainty_scores)
+    min_distances = np.min(kmeans.transform(unlabeled), axis=1)
+
+    return np.argsort(min_distances)[:n_instances]
+
+
+def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
+                         X: Union[np.ndarray, sp.csr_matrix],
+                         n_instances: int = 20,
+                         n_jobs: Optional[int] = None,
+                         filter_param: int = 10,
+                         **uncertainty_measure_kwargs
+                         ) -> np.ndarray:
+    """
+    Batch sampling query strategy. Selects the least sure instances for labelling.
+
+    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
+    traditional active learning query strategies suffer from sub-optimal record selection when passing
+    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
+    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
+    batch are most important for labeling?
+
+    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
+    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        X: Set of records to be considered for our active learning model.
+        n_instances: Number of records to return for labeling from `X`.
+        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
+            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
+
+    Returns:
+        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
+    """
+    uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)  # flip margin so larger = more uncertain
+    unlabeled_batch = kmeans_batch(
+        classifier,
+        unlabeled=X,
+        uncertainty_scores=uncertainty,
+        n_instances=n_instances,
+        filter_param=filter_param,
+        n_jobs=n_jobs
+    )
+    return unlabeled_batch
\ No newline at end of file

From 81c349eeeee9c346dc797ba9292e6b5726e8b574 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 12:27:47 -0400
Subject: [PATCH 2/6] update some comment and docstrings

---
 modAL/batch.py | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 5d8b2bf..ad12979 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -218,31 +218,30 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                         n_instances=n_instances, metric=metric, n_jobs=n_jobs)
 
-def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
-                 unlabeled: modALinput,
-                 uncertainty_scores: np.ndarray,
-                 n_instances: int,
-                 n_jobs: Union[int, None]) -> np.ndarray:
+def kmeans_batch(
+    classifier: Union[BaseLearner, BaseCommittee],
+    unlabeled: modALinput,
+    uncertainty_scores: np.ndarray,
+    n_instances: int,
+    filter_param: int,
+) -> np.ndarray:
     """
     Query the top :n_instances: records to request labels for.
 
-    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
-    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
 
     Args:
         classifier: One of modAL's supported active learning models.
         unlabeled: Set of records to be considered for our active learning model.
         uncertainty_scores: Our classifier's predictions over the response variable.
         n_instances: Limit on the number of records to query from our unlabeled set.
-        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
-        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        filter_param: Controls the number of examples used for sampling. Limits the K-Means dataset to the top
+            `n_instances * filter_param` most informative examples.
 
     Returns:
-        The indices of the top n_instances ranked unlabelled samples.
+        The indices of the top n_instances unlabelled samples.
     """
-    # Make a local copy of our classifier's training data.
-    # Define our record container and record the best cold start instance in the case of cold start.
-
     # transform unlabeled data if needed
     if classifier.on_transformed:
         unlabeled = classifier.transform_without_estimating(unlabeled)
@@ -261,32 +260,29 @@ def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
 def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
                          X: Union[np.ndarray, sp.csr_matrix],
                          n_instances: int = 20,
-                         n_jobs: Optional[int] = None,
                          filter_param: int = 10,
                          **uncertainty_measure_kwargs
                          ) -> np.ndarray:
     """
-    Batch sampling query strategy. Selects the least sure instances for labelling.
+    Batch sampling query strategy that tries to consider both diversity and informativeness.
 
-    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
-    traditional active learning query strategies suffer from sub-optimal record selection when passing
-    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
-    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
-    batch are most important for labeling?
+    This strategy uses weighted K-Means (the weights being some uncertainty measure) to determine
+    a batch of samples to label that are both informative and diverse. Margin-based uncertainty
+    has been found to perform best, so that is what we use here.
 
-    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
-    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
 
     Args:
         classifier: One of modAL's supported active learning models.
         X: Set of records to be considered for our active learning model.
         n_instances: Number of records to return for labeling from `X`.
-        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
-            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        filter_param: Controls the number of examples used for sampling. Limits the K-Means dataset to the top
+            `n_instances * filter_param` most informative examples.
         **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
 
     Returns:
-        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
+        Indices of the instances from `X` chosen to be labelled.
     """
     uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)  # flip margin so larger = more uncertain
     unlabeled_batch = kmeans_batch(
@@ -294,7 +290,6 @@ def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
         unlabeled=X,
         uncertainty_scores=uncertainty,
         n_instances=n_instances,
-        filter_param=filter_param,
-        n_jobs=n_jobs
+        filter_param=filter_param
     )
     return unlabeled_batch
\ No newline at end of file

From 0f89ab35c2eb9e307bca2b06ef4852f9dfc1dccd Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 16:17:29 -0400
Subject: [PATCH 3/6] make filter_param actually do something

---
 modAL/batch.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modAL/batch.py b/modAL/batch.py
index ad12979..b412c06 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -242,6 +242,12 @@ def kmeans_batch(
     Returns:
         The indices of the top n_instances unlabelled samples.
""" + # Limit data set based on n_instances and filter_param + record_limit = filter_param * n_instances + keep_args = np.argsort(uncertainty_scores)[-record_limit:] + uncertainty_scores = uncertainty_scores[keep_args] + unlabeled = unlabeled[keep_args] + # transform unlabeled data if needed if classifier.on_transformed: unlabeled = classifier.transform_without_estimating(unlabeled) From f2321cb5d980e343fc737e283269ba02ff216c9b Mon Sep 17 00:00:00 2001 From: mbriner Date: Mon, 21 Jun 2021 21:21:27 -0400 Subject: [PATCH 4/6] update tests --- modAL/batch.py | 18 ++--- tests/core_tests.py | 10 ++- tests/example_tests/diverse_batch_kmeans.py | 79 +++++++++++++++++++++ 3 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 tests/example_tests/diverse_batch_kmeans.py diff --git a/modAL/batch.py b/modAL/batch.py index b412c06..68239ba 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -242,21 +242,21 @@ def kmeans_batch( Returns: The indices of the top n_instances unlabelled samples. """ - # Limit data set based on n_instances and filter_param - record_limit = filter_param * n_instances - keep_args = np.argsort(uncertainty_scores)[-record_limit:] - uncertainty_scores = uncertainty_scores[keep_args] - unlabeled = unlabeled[keep_args] # transform unlabeled data if needed if classifier.on_transformed: unlabeled = classifier.transform_without_estimating(unlabeled) - if classifier.X_training is None: - # TODO: Random or diversity-based? - return + # Limit data set based on n_instances and filter_param + record_limit = filter_param * n_instances + keep_args = np.argsort(uncertainty_scores)[-record_limit:] + uncertainty_scores = uncertainty_scores[keep_args] + unlabeled = unlabeled[keep_args] - kmeans = KMeans(n_clusters=n_instances) + # Avoids ValueErrors when we try to sample more instances than we have data points + n_clusters = min(n_instances, unlabeled.shape[0]) + + kmeans = KMeans(n_clusters=n_clusters) kmeans.fit(unlabeled, sample_weight=uncertainty_scores) min_distances = np.min(kmeans.transform(unlabeled), axis=1) diff --git a/tests/core_tests.py b/tests/core_tests.py index 1ed4f95..217c58d 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -799,7 +799,8 @@ def test_on_transformed(self): n_samples = 10 n_features = 5 query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 25.09.2020 ] @@ -831,7 +832,8 @@ def test_on_transformed_with_variable_transformation(self): properly for on_transformed=True query strategies. 
""" query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 09.12.2020 ] @@ -1152,7 +1154,8 @@ def test_on_transformed(self): n_samples = 10 n_features = 5 query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 25.09.2020 ] @@ -1318,6 +1321,7 @@ def test_examples(self): import example_tests.information_density import example_tests.bayesian_optimization import example_tests.ranked_batch_mode + import example_tests.diverse_batch_kmeans if __name__ == '__main__': diff --git a/tests/example_tests/diverse_batch_kmeans.py b/tests/example_tests/diverse_batch_kmeans.py new file mode 100644 index 0000000..1fe5606 --- /dev/null +++ b/tests/example_tests/diverse_batch_kmeans.py @@ -0,0 +1,79 @@ +import numpy as np +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA +from sklearn.neighbors import KNeighborsClassifier +from functools import partial + +from modAL.batch import diverse_batch_kmeans +from modAL.models import ActiveLearner + +# Set our RNG for reproducibility. +RANDOM_STATE_SEED = 123 +np.random.seed(RANDOM_STATE_SEED) + +iris = load_iris() +X_raw = iris['data'] +y_raw = iris['target'] + +# Define our PCA transformer and fit it onto our raw dataset. +pca = PCA(n_components=2, random_state=RANDOM_STATE_SEED) +transformed_iris = pca.fit_transform(X=X_raw) + +# Isolate the data we'll need for plotting. +x_component, y_component = transformed_iris[:, 0], transformed_iris[:, 1] + +# Isolate our examples for our labeled dataset. +n_labeled_examples = X_raw.shape[0] +training_indices = np.random.randint(low=0, high=n_labeled_examples + 1, size=3) + +X_train = X_raw[training_indices] +y_train = y_raw[training_indices] + +# Isolate the non-training examples we'll be querying. +X_pool = np.delete(X_raw, training_indices, axis=0) +y_pool = np.delete(y_raw, training_indices, axis=0) + +# Pre-set our batch sampling to retrieve 3 samples at a time. +BATCH_SIZE = 3 +preset_batch = partial(diverse_batch_kmeans, n_instances=BATCH_SIZE) + +# Testing the cold-start +learner = ActiveLearner( + estimator=KNeighborsClassifier(n_neighbors=3), + query_strategy=preset_batch +) +cold_start_idx, cold_start_inst = learner.query(X_raw) +learner.teach(X_raw[cold_start_idx], y_raw[cold_start_idx]) + +# Specify our active learning model. +learner = ActiveLearner( + estimator=KNeighborsClassifier(n_neighbors=3), + X_training=X_train, + y_training=y_train, + query_strategy=preset_batch +) + +predictions = learner.predict(X_raw) + +# Record our learner's score on the raw data. +unqueried_score = learner.score(X_raw, y_raw) + +# Pool-based sampling +N_RAW_SAMPLES = 20 +N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE + +for index in range(N_QUERIES): + query_index, query_instance = learner.query(X_pool) + + # Teach our ActiveLearner model the record it has requested. + X, y = X_pool[query_index], y_pool[query_index] + learner.teach(X=X, y=y) + + # Remove the queried instance from the unlabeled pool. + X_pool = np.delete(X_pool, query_index, axis=0) + y_pool = np.delete(y_pool, query_index) + + # Calculate and report our model's accuracy. 
+    model_accuracy = learner.score(X_raw, y_raw)
+
+predictions = learner.predict(X_raw)
\ No newline at end of file

From 22fadcecaf6ff02d1542225aeb3235b66bd25200 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Mon, 21 Jun 2021 21:24:07 -0400
Subject: [PATCH 5/6] bump scikit-learn version to support weighted kmeans

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c3f2b60..5af5bdf 100644
--- a/setup.py
+++ b/setup.py
@@ -10,5 +10,5 @@
     url='https://modAL-python.github.io/',
     packages=['modAL', 'modAL.models', 'modAL.utils'],
     classifiers=['Development Status :: 4 - Beta'],
-    install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
+    install_requires=['numpy>=1.13', 'scikit-learn>=0.20', 'scipy>=0.18', 'pandas>=1.1.0'],
 )

From 52afb6aeefa11ef99147b4520a2193beaeaeeccb Mon Sep 17 00:00:00 2001
From: mbriner
Date: Mon, 21 Jun 2021 22:30:47 -0400
Subject: [PATCH 6/6] modify to return points closest to each cluster center, not just any cluster center

---
 modAL/batch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 68239ba..25a0a43 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -255,12 +255,13 @@ def kmeans_batch(
     # Avoids ValueErrors when we try to sample more instances than we have data points
     n_clusters = min(n_instances, unlabeled.shape[0])
-
+
+    # Fit kmeans to data
     kmeans = KMeans(n_clusters=n_clusters)
     kmeans.fit(unlabeled, sample_weight=uncertainty_scores)
-    min_distances = np.min(kmeans.transform(unlabeled), axis=1)
 
-    return np.argsort(min_distances)[:n_instances]
+    # Return the point closest to each cluster center, mapped back to pool indices via keep_args
+    return keep_args[np.argmin(kmeans.transform(unlabeled), axis=0)]
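
A quick way to sanity-check the end state of the series is to run the selection logic outside modAL. The sketch below mirrors the final kmeans_batch/diverse_batch_kmeans pipeline (margin-based informativeness, filtering to the top filter_param * n_instances points, weighted K-Means, one representative per centroid). It assumes scikit-learn >= 0.20 for KMeans' sample_weight support; select_diverse_batch and the mock pool data are illustrative, not part of the patch.

import numpy as np
from sklearn.cluster import KMeans

def select_diverse_batch(probs: np.ndarray, X_pool: np.ndarray,
                         n_instances: int = 3, filter_param: int = 10) -> np.ndarray:
    # Margin-based informativeness: 1 - (P(top class) - P(runner-up)),
    # so less confident predictions get larger weights.
    part = np.partition(probs, -2, axis=1)
    informativeness = 1 - (part[:, -1] - part[:, -2])

    # Keep only the filter_param * n_instances most informative pool points.
    keep = np.argsort(informativeness)[-filter_param * n_instances:]

    # Weighted K-Means: informative points pull the centroids toward themselves.
    kmeans = KMeans(n_clusters=min(n_instances, keep.size))
    kmeans.fit(X_pool[keep], sample_weight=informativeness[keep])

    # One representative per cluster: the kept point closest to each centroid,
    # mapped back to indices into the original pool via keep.
    return keep[np.argmin(kmeans.transform(X_pool[keep]), axis=0)]

# Example: pick 3 of 100 pool points from mock class probabilities.
rng = np.random.default_rng(0)
X_pool = rng.normal(size=(100, 4))
logits = rng.normal(size=(100, 3))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(select_diverse_batch(probs, X_pool, n_instances=3))

Because exactly one point is returned per centroid, the batch size equals the number of clusters; in degenerate pools the same point can be closest to two centroids, so a caller that needs strictly distinct indices should deduplicate the result.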