From 9048f43f168ddee79e6827c95a7f6be9771a84c7 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 12:16:39 -0400
Subject: [PATCH 1/6] sketch out diverse k-means implementation

---
 modAL/batch.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 38fa732..5d8b2bf 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -6,11 +6,12 @@
 import numpy as np
 import scipy.sparse as sp
+from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min
 
 from modAL.utils.data import data_vstack, modALinput, data_shape
 from modAL.models.base import BaseCommittee, BaseLearner
-from modAL.uncertainty import classifier_uncertainty
+from modAL.uncertainty import classifier_margin, classifier_uncertainty
 
 
 def select_cold_start_instance(X: modALinput,
@@ -216,3 +217,84 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
     return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                         n_instances=n_instances, metric=metric, n_jobs=n_jobs)
+
+
+def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
+                 unlabeled: modALinput,
+                 uncertainty_scores: np.ndarray,
+                 n_instances: int,
+                 n_jobs: Union[int, None]) -> np.ndarray:
+    """
+    Query the top :n_instances: records to request labels for.
+
+    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
+    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        unlabeled: Set of records to be considered for our active learning model.
+        uncertainty_scores: Our classifier's predictions over the response variable.
+        n_instances: Limit on the number of records to query from our unlabeled set.
+        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+
+    Returns:
+        The indices of the top n_instances ranked unlabelled samples.
+    """
+    # Make a local copy of our classifier's training data.
+    # Define our record container and record the best cold start instance in the case of cold start.
+
+    # transform unlabeled data if needed
+    if classifier.on_transformed:
+        unlabeled = classifier.transform_without_estimating(unlabeled)
+
+    if classifier.X_training is None:
+        # TODO: Random or diversity-based?
+        return
+
+    kmeans = KMeans(n_clusters=n_instances)
+    kmeans.fit(unlabeled, sample_weight=uncertainty_scores)
+    min_distances = np.min(kmeans.transform(unlabeled), axis=1)
+
+    return np.argsort(min_distances)[:n_instances]
+
+
+def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
+                         X: Union[np.ndarray, sp.csr_matrix],
+                         n_instances: int = 20,
+                         n_jobs: Optional[int] = None,
+                         filter_param: int = 10,
+                         **uncertainty_measure_kwargs
+                         ) -> np.ndarray:
+    """
+    Batch sampling query strategy. Selects the least sure instances for labelling.
+
+    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
+    traditional active learning query strategies suffer from sub-optimal record selection when passing
+    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
+    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
+    batch are most important for labeling?
+
+    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
+    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        X: Set of records to be considered for our active learning model.
+        n_instances: Number of records to return for labeling from `X`.
+        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
+            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
+
+    Returns:
+        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
+    """
+    uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)  # flip margin so larger = more uncertain
+    unlabeled_batch = kmeans_batch(
+        classifier,
+        unlabeled=X,
+        uncertainty_scores=uncertainty,
+        n_instances=n_instances,
+        filter_param=filter_param,
+        n_jobs=n_jobs
+    )
+    return unlabeled_batch
\ No newline at end of file

From 81c349eeeee9c346dc797ba9292e6b5726e8b574 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 12:27:47 -0400
Subject: [PATCH 2/6] update some comment and docstrings

---
 modAL/batch.py | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 5d8b2bf..ad12979 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -218,31 +218,30 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                         n_instances=n_instances, metric=metric, n_jobs=n_jobs)
 
-def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
-                 unlabeled: modALinput,
-                 uncertainty_scores: np.ndarray,
-                 n_instances: int,
-                 n_jobs: Union[int, None]) -> np.ndarray:
+def kmeans_batch(
+    classifier: Union[BaseLearner, BaseCommittee],
+    unlabeled: modALinput,
+    uncertainty_scores: np.ndarray,
+    n_instances: int,
+    filter_param: int,
+) -> np.ndarray:
     """
     Query the top :n_instances: records to request labels for.
 
-    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
-    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
 
     Args:
         classifier: One of modAL's supported active learning models.
         unlabeled: Set of records to be considered for our active learning model.
         uncertainty_scores: Our classifier's predictions over the response variable.
         n_instances: Limit on the number of records to query from our unlabeled set.
-        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
-        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        filter_param: Controls the number of examples used for sampling. Limits the K-Means dataset to the top
+            `n_instances * filter_param` most informative examples.
 
     Returns:
-        The indices of the top n_instances ranked unlabelled samples.
+        The indices of the top n_instances unlabelled samples.
     """
-    # Make a local copy of our classifier's training data.
-    # Define our record container and record the best cold start instance in the case of cold start.
-
     # transform unlabeled data if needed
     if classifier.on_transformed:
         unlabeled = classifier.transform_without_estimating(unlabeled)
@@ -261,32 +260,29 @@ def kmeans_batch(classifier: Union[BaseLearner, BaseCommittee],
 def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
                          X: Union[np.ndarray, sp.csr_matrix],
                          n_instances: int = 20,
-                         n_jobs: Optional[int] = None,
                          filter_param: int = 10,
                          **uncertainty_measure_kwargs
                          ) -> np.ndarray:
     """
-    Batch sampling query strategy. Selects the least sure instances for labelling.
+    Batch sampling query strategy that tries to consider both diversity and informativeness.
 
-    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
-    traditional active learning query strategies suffer from sub-optimal record selection when passing
-    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
-    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
-    batch are most important for labeling?
+    This strategy uses weighted K-Means (the weights being some uncertainty measure) to determine
+    a batch of samples to label that are both informative and diverse. Margin-based uncertainty
+    has been found to perform best, so that is what we use here.
 
-    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
-    https://www.sciencedirect.com/science/article/pii/S0020025516313949
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
 
     Args:
         classifier: One of modAL's supported active learning models.
         X: Set of records to be considered for our active learning model.
         n_instances: Number of records to return for labeling from `X`.
-        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
-            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
+        filter_param: Controls the number of examples used for sampling. Limits the K-Means dataset to the top
+            `n_instances * filter_param` most informative examples.
         **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
 
     Returns:
-        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
+        Indices of the instances from `X` chosen to be labelled.
     """
     uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)  # flip margin so larger = more uncertain
     unlabeled_batch = kmeans_batch(
@@ -294,7 +290,6 @@ def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
         unlabeled=X,
         uncertainty_scores=uncertainty,
         n_instances=n_instances,
-        filter_param=filter_param,
-        n_jobs=n_jobs
+        filter_param=filter_param
     )
     return unlabeled_batch
\ No newline at end of file

From 0f89ab35c2eb9e307bca2b06ef4852f9dfc1dccd Mon Sep 17 00:00:00 2001
From: mbriner
Date: Sat, 19 Jun 2021 16:17:29 -0400
Subject: [PATCH 3/6] make filter_param actually do something

---
 modAL/batch.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modAL/batch.py b/modAL/batch.py
index ad12979..b412c06 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -242,6 +242,12 @@ def kmeans_batch(
     Returns:
         The indices of the top n_instances unlabelled samples.
""" + # Limit data set based on n_instances and filter_param + record_limit = filter_param * n_instances + keep_args = np.argsort(uncertainty_scores)[-record_limit:] + uncertainty_scores = uncertainty_scores[keep_args] + unlabeled = unlabeled[keep_args] + # transform unlabeled data if needed if classifier.on_transformed: unlabeled = classifier.transform_without_estimating(unlabeled) From f2321cb5d980e343fc737e283269ba02ff216c9b Mon Sep 17 00:00:00 2001 From: mbriner Date: Mon, 21 Jun 2021 21:21:27 -0400 Subject: [PATCH 4/6] update tests --- modAL/batch.py | 18 ++--- tests/core_tests.py | 10 ++- tests/example_tests/diverse_batch_kmeans.py | 79 +++++++++++++++++++++ 3 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 tests/example_tests/diverse_batch_kmeans.py diff --git a/modAL/batch.py b/modAL/batch.py index b412c06..68239ba 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -242,21 +242,21 @@ def kmeans_batch( Returns: The indices of the top n_instances unlabelled samples. """ - # Limit data set based on n_instances and filter_param - record_limit = filter_param * n_instances - keep_args = np.argsort(uncertainty_scores)[-record_limit:] - uncertainty_scores = uncertainty_scores[keep_args] - unlabeled = unlabeled[keep_args] # transform unlabeled data if needed if classifier.on_transformed: unlabeled = classifier.transform_without_estimating(unlabeled) - if classifier.X_training is None: - # TODO: Random or diversity-based? - return + # Limit data set based on n_instances and filter_param + record_limit = filter_param * n_instances + keep_args = np.argsort(uncertainty_scores)[-record_limit:] + uncertainty_scores = uncertainty_scores[keep_args] + unlabeled = unlabeled[keep_args] - kmeans = KMeans(n_clusters=n_instances) + # Avoids ValueErrors when we try to sample more instances than we have data points + n_clusters = min(n_instances, unlabeled.shape[0]) + + kmeans = KMeans(n_clusters=n_clusters) kmeans.fit(unlabeled, sample_weight=uncertainty_scores) min_distances = np.min(kmeans.transform(unlabeled), axis=1) diff --git a/tests/core_tests.py b/tests/core_tests.py index 1ed4f95..217c58d 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -799,7 +799,8 @@ def test_on_transformed(self): n_samples = 10 n_features = 5 query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 25.09.2020 ] @@ -831,7 +832,8 @@ def test_on_transformed_with_variable_transformation(self): properly for on_transformed=True query strategies. 
""" query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 09.12.2020 ] @@ -1152,7 +1154,8 @@ def test_on_transformed(self): n_samples = 10 n_features = 5 query_strategies = [ - modAL.batch.uncertainty_batch_sampling + modAL.batch.uncertainty_batch_sampling, + modAL.batch.diverse_batch_kmeans, # add further strategies which work with instance representations # no further ones as of 25.09.2020 ] @@ -1318,6 +1321,7 @@ def test_examples(self): import example_tests.information_density import example_tests.bayesian_optimization import example_tests.ranked_batch_mode + import example_tests.diverse_batch_kmeans if __name__ == '__main__': diff --git a/tests/example_tests/diverse_batch_kmeans.py b/tests/example_tests/diverse_batch_kmeans.py new file mode 100644 index 0000000..1fe5606 --- /dev/null +++ b/tests/example_tests/diverse_batch_kmeans.py @@ -0,0 +1,79 @@ +import numpy as np +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA +from sklearn.neighbors import KNeighborsClassifier +from functools import partial + +from modAL.batch import diverse_batch_kmeans +from modAL.models import ActiveLearner + +# Set our RNG for reproducibility. +RANDOM_STATE_SEED = 123 +np.random.seed(RANDOM_STATE_SEED) + +iris = load_iris() +X_raw = iris['data'] +y_raw = iris['target'] + +# Define our PCA transformer and fit it onto our raw dataset. +pca = PCA(n_components=2, random_state=RANDOM_STATE_SEED) +transformed_iris = pca.fit_transform(X=X_raw) + +# Isolate the data we'll need for plotting. +x_component, y_component = transformed_iris[:, 0], transformed_iris[:, 1] + +# Isolate our examples for our labeled dataset. +n_labeled_examples = X_raw.shape[0] +training_indices = np.random.randint(low=0, high=n_labeled_examples + 1, size=3) + +X_train = X_raw[training_indices] +y_train = y_raw[training_indices] + +# Isolate the non-training examples we'll be querying. +X_pool = np.delete(X_raw, training_indices, axis=0) +y_pool = np.delete(y_raw, training_indices, axis=0) + +# Pre-set our batch sampling to retrieve 3 samples at a time. +BATCH_SIZE = 3 +preset_batch = partial(diverse_batch_kmeans, n_instances=BATCH_SIZE) + +# Testing the cold-start +learner = ActiveLearner( + estimator=KNeighborsClassifier(n_neighbors=3), + query_strategy=preset_batch +) +cold_start_idx, cold_start_inst = learner.query(X_raw) +learner.teach(X_raw[cold_start_idx], y_raw[cold_start_idx]) + +# Specify our active learning model. +learner = ActiveLearner( + estimator=KNeighborsClassifier(n_neighbors=3), + X_training=X_train, + y_training=y_train, + query_strategy=preset_batch +) + +predictions = learner.predict(X_raw) + +# Record our learner's score on the raw data. +unqueried_score = learner.score(X_raw, y_raw) + +# Pool-based sampling +N_RAW_SAMPLES = 20 +N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE + +for index in range(N_QUERIES): + query_index, query_instance = learner.query(X_pool) + + # Teach our ActiveLearner model the record it has requested. + X, y = X_pool[query_index], y_pool[query_index] + learner.teach(X=X, y=y) + + # Remove the queried instance from the unlabeled pool. + X_pool = np.delete(X_pool, query_index, axis=0) + y_pool = np.delete(y_pool, query_index) + + # Calculate and report our model's accuracy. 
+    model_accuracy = learner.score(X_raw, y_raw)
+
+predictions = learner.predict(X_raw)
\ No newline at end of file

From 22fadcecaf6ff02d1542225aeb3235b66bd25200 Mon Sep 17 00:00:00 2001
From: mbriner
Date: Mon, 21 Jun 2021 21:24:07 -0400
Subject: [PATCH 5/6] bump scikit-learn version to support weighted kmeans

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c3f2b60..5af5bdf 100644
--- a/setup.py
+++ b/setup.py
@@ -10,5 +10,5 @@
     url='https://modAL-python.github.io/',
     packages=['modAL', 'modAL.models', 'modAL.utils'],
     classifiers=['Development Status :: 4 - Beta'],
-    install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
+    install_requires=['numpy>=1.13', 'scikit-learn>=0.20', 'scipy>=0.18', 'pandas>=1.1.0'],
 )

From 52afb6aeefa11ef99147b4520a2193beaeaeeccb Mon Sep 17 00:00:00 2001
From: mbriner
Date: Mon, 21 Jun 2021 22:30:47 -0400
Subject: [PATCH 6/6] modify to return points closest to each cluster center, not just any cluster center

---
 modAL/batch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modAL/batch.py b/modAL/batch.py
index 68239ba..25a0a43 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -255,12 +255,13 @@ def kmeans_batch(
     # Avoids ValueErrors when we try to sample more instances than we have data points
     n_clusters = min(n_instances, unlabeled.shape[0])
-
+
+    # Fit kmeans to data
     kmeans = KMeans(n_clusters=n_clusters)
     kmeans.fit(unlabeled, sample_weight=uncertainty_scores)
-    min_distances = np.min(kmeans.transform(unlabeled), axis=1)
 
-    return np.argsort(min_distances)[:n_instances]
+    # Return the point closest to each cluster center, mapped back to pool indices via keep_args
+    return keep_args[np.argmin(kmeans.transform(unlabeled), axis=0)]
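
A quick way to sanity-check the end state of the series is to run the selection logic outside modAL. The sketch below mirrors the final kmeans_batch/diverse_batch_kmeans pipeline (margin-based informativeness, filtering to the top filter_param * n_instances points, weighted K-Means, one representative per centroid). It assumes scikit-learn >= 0.20 for KMeans' sample_weight support; select_diverse_batch and the mock pool data are illustrative, not part of the patch.

import numpy as np
from sklearn.cluster import KMeans

def select_diverse_batch(probs: np.ndarray, X_pool: np.ndarray,
                         n_instances: int = 3, filter_param: int = 10) -> np.ndarray:
    # Margin-based informativeness: 1 - (P(top class) - P(runner-up)),
    # so less confident predictions get larger weights.
    part = np.partition(probs, -2, axis=1)
    informativeness = 1 - (part[:, -1] - part[:, -2])

    # Keep only the filter_param * n_instances most informative pool points.
    keep = np.argsort(informativeness)[-filter_param * n_instances:]

    # Weighted K-Means: informative points pull the centroids toward themselves.
    kmeans = KMeans(n_clusters=min(n_instances, keep.size))
    kmeans.fit(X_pool[keep], sample_weight=informativeness[keep])

    # One representative per cluster: the kept point closest to each centroid,
    # mapped back to indices into the original pool via keep.
    return keep[np.argmin(kmeans.transform(X_pool[keep]), axis=0)]

# Example: pick 3 of 100 pool points from mock class probabilities.
rng = np.random.default_rng(0)
X_pool = rng.normal(size=(100, 4))
logits = rng.normal(size=(100, 3))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(select_diverse_batch(probs, X_pool, n_instances=3))

Because exactly one point is returned per centroid, the batch size equals the number of clusters; in degenerate pools the same point can be closest to two centroids, so a caller that needs strictly distinct indices should deduplicate the result.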