From fe1b1d31c93c592eb597474493f520b7f6b6434c Mon Sep 17 00:00:00 2001
From: Miles Granger <miles59923@gmail.com>
Date: Thu, 16 May 2019 12:43:32 +0200
Subject: [PATCH] Support user defined clustering function

Also update Example.ipynb with an example
---
 Example.ipynb             | 55 +++++++++++++++++++++++++++++++++++++--
 README.md                 |  5 ++++
 gap_statistic/__init__.py |  2 +-
 gap_statistic/optimalK.py | 20 +++++++-------
 tests/test_optimalK.py    | 35 ++++++++++++++++++++++---
 5 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/Example.ipynb b/Example.ipynb
index 15b8e1a..2f23937 100644
--- a/Example.ipynb
+++ b/Example.ipynb
@@ -250,7 +250,58 @@
    "source": [
     "### Notes:\n",
     "\n",
-    "Please be aware that, as the image above hints to, number of clusters can be subjective. This is merely meant to provide a suggestion to the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also due to random initialization, the suggested n_clusters could also vary."
+    "Please be aware that, as the image above hints to, number of clusters can be subjective. This is merely meant to provide a suggestion to the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also due to random initialization, the suggested n_clusters could also vary.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Use your own clustering algorithm! (Added in v1.6.0)\n",
+    "\n",
+    "The default implementation uses KMeans, but the Gap Statistic allows any clustering algorithm, so you can define your own as in the following example:\n",
+    "\n",
+    "\n",
+    "---\n",
+    "\n",
+    "#### First, define a function which takes `X` and `k`; it _must_ return a tuple of the centroid locations and the labels assigned to `X`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We'll wrap the `MeanShift` algorithm from sklearn\n",
+    "\n",
+    "from sklearn.cluster import MeanShift\n",
+    "\n",
+    "def special_clustering_func(X, k):\n",
+    "    \"\"\" \n",
+    "    Special clustering function which uses the MeanShift\n",
+    "    model from sklearn.\n",
+    "    \n",
+    "    These user defined functions *must* take the X and a k \n",
+    "    and can take an arbitrary number of other kwargs, which can\n",
+    "    be passed with `clusterer_kwargs` when initializing OptimalK\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Here you can do whatever clustering algorithm your heart desires,\n",
+    "    # but we'll do a simple wrap of the MeanShift model in sklearn.\n",
+    "    \n",
+    "    m = MeanShift()\n",
+    "    m.fit(X)\n",
+    "    \n",
+    "    # Return the location of each cluster center,\n",
+    "    # and the labels for each point.\n",
+    "    return m.cluster_centers_, m.predict(X)\n",
+    "\n",
+    "# Make some data\n",
+    "X, y = make_blobs(n_samples=50, n_features=2, centers=3, random_state=25)\n",
+    "\n",
+    "# Define the OptimalK instance, but pass in our own clustering function\n",
+    "optimalk = OptimalK(clusterer=special_clustering_func)\n",
+    "\n",
+    "# Use the callable instance as normal.\n",
+    "n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 4))"
    ]
   }
  ],
@@ -270,7 +321,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.5"
+   "version": "3.7.2"
   }
  },
  "nbformat": 4,
diff --git a/README.md b/README.md
index ab86166..9504dc9 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,11 @@ pip uninstall gap-stat
 
 Change Log (Latest first):
 
+- 1.6
+    - May-2019
+    - Support user defined functions for the clustering algorithm used in the gap statistic process
+    - Migrate to Azure Pipelines CI
+
 - 1.5.2
     - August-2018
     - Fix calculation of gap statistic
diff --git a/gap_statistic/__init__.py b/gap_statistic/__init__.py
index 0d39e37..f5cf64f 100644
--- a/gap_statistic/__init__.py
+++ b/gap_statistic/__init__.py
@@ -1,2 +1,2 @@
 from gap_statistic.optimalK import OptimalK
-__version__ = '1.5.2'
+__version__ = '1.6.0'
diff --git a/gap_statistic/optimalK.py b/gap_statistic/optimalK.py
index f1ac9d0..19640a6 100644
--- a/gap_statistic/optimalK.py
+++ b/gap_statistic/optimalK.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from multiprocessing import cpu_count
-from typing import Union, Iterable, Tuple
+from typing import Union, Iterable, Tuple, Callable
 from scipy.cluster.vq import kmeans2
 
 try:
@@ -31,16 +31,21 @@ class OptimalK:
     """
     gap_df = None
 
-    def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib') -> None:
+    def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Callable=None, clusterer_kwargs: dict=None) -> None:
         """
         Construct OptimalK to use n_jobs (multiprocessing using joblib, multiprocessing, or single core.
-        if parallel_backend == 'rust' (fastest) default is to use all cores.
+        if parallel_backend == 'rust' it will use all cores.
 
-        :param n_jobs - int: Number of CPU cores to use. Use all cores if n_jobs == -1 ignored if backend is 'rust'
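+        :param n_jobs: Number of CPU cores to use; values outside 1..cpu_count() (e.g. -1) fall back to all cores.
+        :param parallel_backend: One of 'joblib', 'multiprocessing' or 'rust'; any other value is stored as None.
+        :param clusterer: Callable taking (X, k, **clusterer_kwargs) and returning (centroids, labels); defaults to scipy's kmeans2.
+        :param clusterer_kwargs: Extra keyword arguments for `clusterer`; ignored when `clusterer` is None (kmeans2 then runs with iter=10, minit='points').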
         """
         self.parallel_backend = parallel_backend if parallel_backend in ['joblib', 'multiprocessing', 'rust'] else None
         self.n_jobs = n_jobs if 1 <= n_jobs <= cpu_count() else cpu_count()  # type: int
         self.n_jobs = 1 if parallel_backend is None else self.n_jobs
+        self.clusterer = clusterer if clusterer is not None else kmeans2
+        self.clusterer_kwargs = clusterer_kwargs or dict() if clusterer is not None else dict(iter=10, minit='points')
 
     def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_array: Iterable[int]=()):
         """
@@ -110,15 +115,12 @@ def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clus
             random_data = np.random.random_sample(size=X.shape)  # type: np.ndarray
 
             # Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array.
-            centroids, labels = kmeans2(data=random_data,
-                                        k=n_clusters,
-                                        iter=10,
-                                        minit='points')  # type: Tuple[np.ndarray, np.ndarray]
+            centroids, labels = self.clusterer(random_data, n_clusters, **self.clusterer_kwargs)  # type: Tuple[np.ndarray, np.ndarray]
             dispersion = self._calculate_dispersion(X=random_data, labels=labels, centroids=centroids)  # type: float
             ref_dispersions[i] = dispersion
 
         # Fit cluster to original data and create dispersion calc.
-        centroids, labels = kmeans2(data=X, k=n_clusters, iter=10, minit='points')
+        centroids, labels = self.clusterer(X, n_clusters, **self.clusterer_kwargs)  # type: Tuple[np.ndarray, np.ndarray]
         dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids)
 
         # Calculate gap statistic
diff --git a/tests/test_optimalK.py b/tests/test_optimalK.py
index d4c9631..e42a3b8 100644
--- a/tests/test_optimalK.py
+++ b/tests/test_optimalK.py
@@ -1,6 +1,38 @@
 # -*- coding: utf-8 -*-
 import pytest
 
+import numpy as np
+from sklearn.datasets import make_blobs
+from sklearn.cluster import SpectralClustering, KMeans, MeanShift
+
+from gap_statistic import OptimalK
+
+
+@pytest.mark.parametrize("ClusterModel", [KMeans, MeanShift])
+def test_alternative_clusting_method(ClusterModel):
+    """
+    Test that users can supply alternative clustering method as dep injection
+    """
+
+    def clusterer(X: np.ndarray, k: int, another_test_arg):
+        """
+        Wrap a sklearn model as a clusterer for OptimalK; takes the data matrix
+        and k first, then any extra arguments supplied via `clusterer_kwargs`.
+        """
+        m = ClusterModel()
+        m.fit(X)
+        assert another_test_arg == 'test'
+        return m.cluster_centers_, m.predict(X)
+
+    optimalk = OptimalK(n_jobs=-1,
+                        parallel_backend='joblib',
+                        clusterer=clusterer,
+                        clusterer_kwargs={'another_test_arg': 'test'}
+                        )
+    X, y = make_blobs(n_samples=50, n_features=2, centers=3)
+    n_clusters = optimalk(X, n_refs=3, cluster_array=np.arange(1, 5))
+    assert isinstance(n_clusters, int)
+
 
 @pytest.mark.parametrize(
     "parallel_backend, n_jobs, n_clusters", [
@@ -13,9 +45,6 @@ def test_optimalk(parallel_backend, n_jobs, n_clusters):
     """
     Test core functionality of OptimalK using all backends.
     """
-    import numpy as np
-    from sklearn.datasets.samples_generator import make_blobs
-    from gap_statistic import OptimalK
 
     # Create optimalK instance
     optimalK = OptimalK(parallel_backend=parallel_backend, n_jobs=n_jobs)