Skip to content

Commit

Permalink
Support user defined clustering function
Browse files Browse the repository at this point in the history
Also update Example.ipynb with an example
  • Loading branch information
milesgranger committed May 19, 2019
1 parent db030ee commit fe1b1d3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 15 deletions.
55 changes: 53 additions & 2 deletions Example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,58 @@
"source": [
"### Notes:\n",
"\n",
"Please be aware that, as the image above hints to, number of clusters can be subjective. This is merely meant to provide a suggestion to the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also due to random initialization, the suggested n_clusters could also vary."
"Please be aware that, as the image above hints at, the number of clusters can be subjective. This is merely meant to provide a suggestion for the number of clusters in your data; the true amount can vary depending upon your specific objective. The clusters here can be interpreted as three, but also clearly just two. Also, due to random initialization, the suggested n_clusters could vary.\n",
"\n",
"---\n",
"\n",
"### Use your own clustering algorithm! (Added in v1.6.0)\n",
"\n",
"The default implementation uses KMeans, but the Gap Statistic allows any clustering algorithm, so you can define your own as in the following example:\n",
"\n",
"\n",
"---\n",
"\n",
"#### First, define a function which takes `X` and `k`; it _must_ return a tuple of the centroid locations and the labels assigned to `X`"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# We'll wrap the `MeanShift` algorithm from sklearn\n",
"\n",
"from sklearn.cluster import MeanShift\n",
"\n",
"def special_clustering_func(X, k):\n",
" \"\"\" \n",
" Special clustering function which uses the MeanShift\n",
" model from sklearn.\n",
" \n",
" These user defined functions *must* take the X and a k \n",
" and can take an arbitrary number of other kwargs, which can\n",
" be passed with `clusterer_kwargs` when initializing OptimalK\n",
" \"\"\"\n",
" \n",
" # Here you can use whatever clustering algorithm your heart desires,\n",
" # but we'll do a simple wrap of the MeanShift model in sklearn.\n",
" \n",
" m = MeanShift()\n",
" m.fit(X)\n",
" \n",
" # Return the location of each cluster center,\n",
" # and the labels for each point.\n",
" return m.cluster_centers_, m.predict(X)\n",
"\n",
"# Make some data\n",
"X, y = make_blobs(n_samples=50, n_features=2, centers=3, random_state=25)\n",
"\n",
"# Define the OptimalK instance, but pass in our own clustering function\n",
"optimalk = OptimalK(clusterer=special_clustering_func)\n",
"\n",
"# Use the callable instance as normal.\n",
"n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 4))"
]
}
],
Expand All @@ -270,7 +321,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
"version": "3.7.2"
}
},
"nbformat": 4,
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ pip uninstall gap-stat

Change Log (Latest first):

- 1.6
- May-2019
- Support user defined functions for the clustering algorithm used in the gap statistic process
- Migrate to Azure Pipelines CI

- 1.5.2
- August-2018
- Fix calculation of gap statistic
Expand Down
2 changes: 1 addition & 1 deletion gap_statistic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from gap_statistic.optimalK import OptimalK
__version__ = '1.5.2'
__version__ = '1.6.0'
20 changes: 11 additions & 9 deletions gap_statistic/optimalK.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
from typing import Union, Iterable, Tuple
from typing import Union, Iterable, Tuple, Callable
from scipy.cluster.vq import kmeans2

try:
Expand All @@ -31,16 +31,21 @@ class OptimalK:
"""
gap_df = None

def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Callable=None, clusterer_kwargs: dict=None) -> None:
    """
    Construct OptimalK: select the parallel backend, the number of jobs and the clustering function.

    :param n_jobs: Number of CPU cores to use. Any value outside ``1..cpu_count()`` (including the
        default -1) selects all cores. Forced to 1 when no valid parallel backend is selected.
        The original documentation states this is ignored by the 'rust' backend, which uses all cores.
    :param parallel_backend: One of 'joblib', 'multiprocessing' or 'rust'. Any other value
        (including None) is stored as None.
    :param clusterer: Optional callable invoked as ``clusterer(X, k, **clusterer_kwargs)``; its result
        is unpacked as ``(centroids, labels)``. Defaults to ``scipy.cluster.vq.kmeans2``.
    :param clusterer_kwargs: Extra keyword arguments forwarded to a user supplied ``clusterer``.
        Ignored when ``clusterer`` is None; the default kmeans2 is then called with
        ``iter=10, minit='points'``.
    """
    valid_backends = ('joblib', 'multiprocessing', 'rust')
    self.parallel_backend = parallel_backend if parallel_backend in valid_backends else None
    self.n_jobs = n_jobs if 1 <= n_jobs <= cpu_count() else cpu_count()  # type: int

    # Test the *validated* backend, not the raw argument, so an unrecognised
    # backend name falls back to a single job exactly like an explicit None.
    if self.parallel_backend is None:
        self.n_jobs = 1

    if clusterer is None:
        # Default clusterer: kmeans2 with the settings used before clusterers were pluggable.
        self.clusterer = kmeans2
        self.clusterer_kwargs = dict(iter=10, minit='points')
    else:
        self.clusterer = clusterer
        # Copy so later mutation of the caller's dict cannot change this instance.
        self.clusterer_kwargs = dict(clusterer_kwargs or {})

def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_array: Iterable[int]=()):
"""
Expand Down Expand Up @@ -110,15 +115,12 @@ def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clus
random_data = np.random.random_sample(size=X.shape) # type: np.ndarray

# Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array.
centroids, labels = kmeans2(data=random_data,
k=n_clusters,
iter=10,
minit='points') # type: Tuple[np.ndarray, np.ndarray]
centroids, labels = self.clusterer(random_data, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=random_data, labels=labels, centroids=centroids) # type: float
ref_dispersions[i] = dispersion

# Fit cluster to original data and create dispersion calc.
centroids, labels = kmeans2(data=X, k=n_clusters, iter=10, minit='points')
# Cluster the original data X here (not the last reference sample) so the labels and
# centroids correspond to the X handed to the dispersion calculation below.
centroids, labels = self.clusterer(X, n_clusters, **self.clusterer_kwargs)  # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids)

# Calculate gap statistic
Expand Down
35 changes: 32 additions & 3 deletions tests/test_optimalK.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
# -*- coding: utf-8 -*-
import pytest

import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import SpectralClustering, KMeans, MeanShift

from gap_statistic import OptimalK


@pytest.mark.parametrize("ClusterModel", [KMeans, MeanShift])
def test_alternative_clusting_method(ClusterModel):
    """
    Test that users can supply an alternative clustering method via dependency injection.
    """

    def clusterer(X: np.ndarray, k: int, another_test_arg):
        """
        Wrap a sklearn model as a clusterer for OptimalK.

        The first two arguments are always the data matrix and k; any further
        arguments are supplied through ``clusterer_kwargs``.
        Note: ``k`` is not forwarded to the model here; it is built with its defaults.
        """
        # Verifies that clusterer_kwargs are passed through to the user function.
        assert another_test_arg == 'test'
        m = ClusterModel()
        m.fit(X)
        return m.cluster_centers_, m.predict(X)

    optimalk = OptimalK(n_jobs=-1,
                        parallel_backend='joblib',
                        clusterer=clusterer,
                        clusterer_kwargs={'another_test_arg': 'test'}
                        )
    # Seed the blob generator so the input data is identical on every run.
    X, y = make_blobs(n_samples=50, n_features=2, centers=3, random_state=25)
    n_clusters = optimalk(X, n_refs=3, cluster_array=np.arange(1, 5))
    assert isinstance(n_clusters, int)


@pytest.mark.parametrize(
"parallel_backend, n_jobs, n_clusters", [
Expand All @@ -13,9 +45,6 @@ def test_optimalk(parallel_backend, n_jobs, n_clusters):
"""
Test core functionality of OptimalK using all backends.
"""
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from gap_statistic import OptimalK

# Create optimalK instance
optimalK = OptimalK(parallel_backend=parallel_backend, n_jobs=n_jobs)
Expand Down

0 comments on commit fe1b1d3

Please sign in to comment.