From fe1b1d31c93c592eb597474493f520b7f6b6434c Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Thu, 16 May 2019 12:43:32 +0200 Subject: [PATCH] Support user defined clustering function Also update Example.ipynb with an example --- Example.ipynb | 55 +++++++++++++++++++++++++++++++++++++-- README.md | 5 ++++ gap_statistic/__init__.py | 2 +- gap_statistic/optimalK.py | 20 +++++++------- tests/test_optimalK.py | 35 ++++++++++++++++++++++--- 5 files changed, 102 insertions(+), 15 deletions(-) diff --git a/Example.ipynb b/Example.ipynb index 15b8e1a..2f23937 100644 --- a/Example.ipynb +++ b/Example.ipynb @@ -250,7 +250,58 @@ "source": [ "### Notes:\n", "\n", - "Please be aware that, as the image above hints to, number of clusters can be subjective. This is merely meant to provide a suggestion to the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also due to random initialization, the suggested n_clusters could also vary." + "Please be aware that, as the image above hints to, number of clusters can be subjective. This is merely meant to provide a suggestion to the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also due to random initialization, the suggested n_clusters could also vary.\n", + "\n", + "---\n", + "\n", + "### Use your own clustering algorithm! 
(Added in v1.6.0)\n", + "\n", + "The default implementation is KMeans, but the Gap Statistic allows using any clustering algorithm; you can define your own as in the following example:\n", + "\n", + "\n", + "---\n", + "\n", + "#### First, define a function which takes `X` and `k`; it _must_ return a tuple of the centroid locations, and the labels assigned to `X`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# We'll wrap the `MeanShift` algorithm from sklearn\n", + "\n", + "from sklearn.cluster import MeanShift\n", + "\n", + "def special_clustering_func(X, k):\n", + " \"\"\" \n", + " Special clustering function which uses the MeanShift\n", + " model from sklearn.\n", + " \n", + " These user defined functions *must* take the X and a k \n", + " and can take an arbitrary number of other kwargs, which can\n", + " be passed with `clusterer_kwargs` when initializing OptimalK\n", + " \"\"\"\n", + " \n", + " # Here you can do whatever clustering algorithm your heart desires,\n", + " # but we'll do a simple wrap of the MeanShift model in sklearn.\n", + " \n", + " m = MeanShift()\n", + " m.fit(X)\n", + " \n", + " # Return the location of each cluster center,\n", + " # and the labels for each point.\n", + " return m.cluster_centers_, m.predict(X)\n", + "\n", + "# Make some data\n", + "X, y = make_blobs(n_samples=50, n_features=2, centers=3, random_state=25)\n", + "\n", + "# Define the OptimalK instance, but pass in our own clustering function\n", + "optimalk = OptimalK(clusterer=special_clustering_func)\n", + "\n", + "# Use the callable instance as normal.\n", + "n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 4))" ] } ], @@ -270,7 +321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.7.2" } }, "nbformat": 4, diff --git a/README.md b/README.md index ab86166..9504dc9 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,11 @@ pip uninstall 
gap-stat Change Log (Latest first): +- 1.6 + - May-2019 + - Support user defined functions for the clustering algorithm used in the gap statistic process + - Migrate to Azure Pipelines CI + - 1.5.2 - August-2018 - Fix calculation of gap statistic diff --git a/gap_statistic/__init__.py b/gap_statistic/__init__.py index 0d39e37..f5cf64f 100644 --- a/gap_statistic/__init__.py +++ b/gap_statistic/__init__.py @@ -1,2 +1,2 @@ from gap_statistic.optimalK import OptimalK -__version__ = '1.5.2' +__version__ = '1.6.0' diff --git a/gap_statistic/optimalK.py b/gap_statistic/optimalK.py index f1ac9d0..19640a6 100644 --- a/gap_statistic/optimalK.py +++ b/gap_statistic/optimalK.py @@ -5,7 +5,7 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor, as_completed from multiprocessing import cpu_count -from typing import Union, Iterable, Tuple +from typing import Union, Iterable, Tuple, Callable from scipy.cluster.vq import kmeans2 try: @@ -31,16 +31,21 @@ class OptimalK: """ gap_df = None - def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib') -> None: + def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Callable=None, clusterer_kwargs: dict=None) -> None: """ Construct OptimalK to use n_jobs (multiprocessing using joblib, multiprocessing, or single core. - if parallel_backend == 'rust' (fastest) default is to use all cores. + if parallel_backend == 'rust' it will use all cores. - :param n_jobs - int: Number of CPU cores to use. 
Use all cores if n_jobs == -1 ignored if backend is 'rust' + :param n_jobs: Number of CPU cores to use; any value outside 1..cpu_count() (e.g. -1) falls back to cpu_count(). + :param parallel_backend: One of 'joblib', 'multiprocessing' or 'rust'; any other value is stored as None. + :param clusterer: Callable invoked as clusterer(X, k, **clusterer_kwargs), returning (centroids, labels); defaults to scipy's kmeans2. + :param clusterer_kwargs: Extra keyword arguments forwarded to clusterer; ignored when clusterer is None, where kmeans2 gets iter=10, minit='points'. """ self.parallel_backend = parallel_backend if parallel_backend in ['joblib', 'multiprocessing', 'rust'] else None self.n_jobs = n_jobs if 1 <= n_jobs <= cpu_count() else cpu_count() # type: int self.n_jobs = 1 if parallel_backend is None else self.n_jobs + self.clusterer = clusterer if clusterer is not None else kmeans2 + self.clusterer_kwargs = clusterer_kwargs or dict() if clusterer is not None else dict(iter=10, minit='points') def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_array: Iterable[int]=()): """ @@ -110,15 +115,12 @@ def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clus random_data = np.random.random_sample(size=X.shape) # type: np.ndarray # Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array. - centroids, labels = kmeans2(data=random_data, - k=n_clusters, - iter=10, - minit='points') # type: Tuple[np.ndarray, np.ndarray] + centroids, labels = self.clusterer(random_data, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray] dispersion = self._calculate_dispersion(X=random_data, labels=labels, centroids=centroids) # type: float ref_dispersions[i] = dispersion # Fit cluster to original data and create dispersion calc. 
- centroids, labels = kmeans2(data=X, k=n_clusters, iter=10, minit='points') + centroids, labels = self.clusterer(X, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray] dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids) # Calculate gap statistic diff --git a/tests/test_optimalK.py b/tests/test_optimalK.py index d4c9631..e42a3b8 100644 --- a/tests/test_optimalK.py +++ b/tests/test_optimalK.py @@ -1,6 +1,38 @@ # -*- coding: utf-8 -*- import pytest +import numpy as np +from sklearn.datasets.samples_generator import make_blobs +from sklearn.cluster import SpectralClustering, KMeans, MeanShift + +from gap_statistic import OptimalK + + +@pytest.mark.parametrize("ClusterModel", [KMeans, MeanShift]) +def test_alternative_clusting_method(ClusterModel): + """ + Test that users can supply alternative clustering method as dep injection + """ + + def clusterer(X: np.ndarray, k: int, another_test_arg): + """ + Function to wrap a sklearn model as a clusterer for OptimalK + First two arguments are always the data matrix and k; extra keyword arguments can be supplied via `clusterer_kwargs` + """ + m = ClusterModel() + m.fit(X) + assert another_test_arg == 'test' + return m.cluster_centers_, m.predict(X) + + optimalk = OptimalK(n_jobs=-1, + parallel_backend='joblib', + clusterer=clusterer, + clusterer_kwargs={'another_test_arg': 'test'} + ) + X, y = make_blobs(n_samples=50, n_features=2, centers=3) + n_clusters = optimalk(X, n_refs=3, cluster_array=np.arange(1, 5)) + assert isinstance(n_clusters, int) + @pytest.mark.parametrize( "parallel_backend, n_jobs, n_clusters", [ @@ -13,9 +45,6 @@ def test_optimalk(parallel_backend, n_jobs, n_clusters): """ Test core functionality of OptimalK using all backends. """ - import numpy as np - from sklearn.datasets.samples_generator import make_blobs - from gap_statistic import OptimalK # Create optimalK instance optimalK = OptimalK(parallel_backend=parallel_backend, n_jobs=n_jobs)