Add black formatting
milesgranger committed May 19, 2019
1 parent fe1b1d3 commit bdb4af1
Showing 6 changed files with 212 additions and 95 deletions.
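For context, black's check mode is what this commit standardizes on: it exits non-zero when any file would be reformatted, which is exactly what the new tests/test_formatting.py below asserts. A minimal sketch of running that check over the same paths (an assumed invocation for illustration, not part of this commit):

import subprocess
import sys

# --check reports files that would change without rewriting them; -v lists each file inspected.
result = subprocess.run(
    [sys.executable, "-m", "black", "--check", "-v", "gap_statistic", "tests", "setup.py"]
)
print("formatting ok" if result.returncode == 0 else "one or more files would be reformatted")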
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@
[![Downloads](http://pepy.tech/badge/gap-stat)](http://pepy.tech/project/gap-stat)
[![Coverage Status](https://coveralls.io/repos/github/milesgranger/gap_statistic/badge.svg)](https://coveralls.io/github/milesgranger/gap_statistic)
[![Code Health](https://landscape.io/github/milesgranger/gap_statistic/master/landscape.svg?style=flat)](https://landscape.io/github/milesgranger/gap_statistic/master)
[![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)


[![Anaconda](https://anaconda.org/milesgranger/gap-stat/badges/version.svg)](https://anaconda.org/milesgranger/gap-stat)
3 changes: 2 additions & 1 deletion gap_statistic/__init__.py
@@ -1,2 +1,3 @@
from gap_statistic.optimalK import OptimalK
__version__ = '1.6.0'

__version__ = "1.6.0"
139 changes: 100 additions & 39 deletions gap_statistic/optimalK.py
@@ -12,7 +12,9 @@
from joblib import Parallel, delayed
except ImportError:
Parallel, delayed = None, None
warnings.warn('joblib not installed, will be unavailable as a backend for parallel processing.')
warnings.warn(
"joblib not installed, will be unavailable as a backend for parallel processing."
)


class OptimalK:
@@ -29,9 +31,16 @@ class OptimalK:
>>> optimalK(X, cluster_array=[1,2,3,4,5])
3
"""

gap_df = None

def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Callable=None, clusterer_kwargs: dict=None) -> None:
def __init__(
self,
n_jobs: int = -1,
parallel_backend: str = "joblib",
clusterer: Callable = None,
clusterer_kwargs: dict = None,
) -> None:
"""
Construct OptimalK to use n_jobs (multiprocessing using joblib, multiprocessing, or single core).
If parallel_backend == 'rust' it will use all cores.
@@ -41,13 +50,26 @@ def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Ca
:param clusterer:
:param clusterer_kwargs:
"""
self.parallel_backend = parallel_backend if parallel_backend in ['joblib', 'multiprocessing', 'rust'] else None
self.parallel_backend = (
parallel_backend
if parallel_backend in ["joblib", "multiprocessing", "rust"]
else None
)
self.n_jobs = n_jobs if 1 <= n_jobs <= cpu_count() else cpu_count() # type: int
self.n_jobs = 1 if parallel_backend is None else self.n_jobs
self.clusterer = clusterer if clusterer is not None else kmeans2
self.clusterer_kwargs = clusterer_kwargs or dict() if clusterer is not None else dict(iter=10, minit='points')

def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_array: Iterable[int]=()):
self.clusterer_kwargs = (
clusterer_kwargs or dict()
if clusterer is not None
else dict(iter=10, minit="points")
)

def __call__(
self,
X: Union[pd.DataFrame, np.ndarray],
n_refs: int = 3,
cluster_array: Iterable[int] = (),
):
"""
Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
http://www.web.stanford.edu/~hastie/Papers/gap.pdf
@@ -60,25 +82,29 @@ def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_ar
# Raise error if values are less than 1 or larger than the number of samples in the set.
cluster_array = np.array([x for x in cluster_array]).astype(int)
if np.where(cluster_array < 1)[0].shape[0]:
raise ValueError('cluster_array contains values less than 1: {}'
.format(cluster_array[np.where(cluster_array < 1)[0]])
)
raise ValueError(
"cluster_array contains values less than 1: {}".format(
cluster_array[np.where(cluster_array < 1)[0]]
)
)
if cluster_array.shape[0] > X.shape[0]:
raise ValueError('The number of suggested clusters to try ({}) is larger than samples in dataset. ({})'
.format(cluster_array.shape[0], X.shape[0])
)
raise ValueError(
"The number of suggested clusters to try ({}) is larger than samples in dataset. ({})".format(
cluster_array.shape[0], X.shape[0]
)
)
if not cluster_array.shape[0]:
raise ValueError('The supplied cluster_array has no values.')
raise ValueError("The supplied cluster_array has no values.")

# Array of resulting gaps.
gap_df = pd.DataFrame({'n_clusters': [], 'gap_value': []})
gap_df = pd.DataFrame({"n_clusters": [], "gap_value": []})

# Define the compute engine; all methods take identical args and are generators.
if self.parallel_backend == 'joblib':
if self.parallel_backend == "joblib":
engine = self._process_with_joblib
elif self.parallel_backend == 'multiprocessing':
elif self.parallel_backend == "multiprocessing":
engine = self._process_with_multiprocessing
elif self.parallel_backend == 'rust':
elif self.parallel_backend == "rust":
engine = self._process_with_rust
else:
engine = self._process_non_parallel
@@ -87,20 +113,32 @@ def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_ar
for (gap_value, n_clusters) in engine(X, n_refs, cluster_array):

# Assign this loop's gap statistic to gaps
gap_df = gap_df.append({'n_clusters': n_clusters, 'gap_value': gap_value}, ignore_index=True)
gap_df = gap_df.append(
{"n_clusters": n_clusters, "gap_value": gap_value}, ignore_index=True
)

self.gap_df = gap_df.sort_values(by='n_clusters', ascending=True).reset_index(drop=True)
self.gap_df = gap_df.sort_values(by="n_clusters", ascending=True).reset_index(
drop=True
)
return int(self.gap_df.loc[np.argmax(self.gap_df.gap_value.values)].n_clusters)

@staticmethod
def _calculate_dispersion(X: Union[pd.DataFrame, np.ndarray], labels: np.ndarray, centroids: np.ndarray) -> float:
def _calculate_dispersion(
X: Union[pd.DataFrame, np.ndarray], labels: np.ndarray, centroids: np.ndarray
) -> float:
"""
Calculate the dispersion between actual points and their assigned centroids
"""
disp = np.sum(np.sum([np.abs(inst - centroids[label]) ** 2 for inst, label in zip(X, labels)])) # type: float
disp = np.sum(
np.sum(
[np.abs(inst - centroids[label]) ** 2 for inst, label in zip(X, labels)]
)
) # type: float
return disp

def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clusters: int) -> Tuple[float, int]:
def _calculate_gap(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clusters: int
) -> Tuple[float, int]:
"""
Calculate the gap value of the given data, n_refs, and number of clusters.
Return the resulting gap value and n_clusters
@@ -115,66 +153,89 @@ def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clus
random_data = np.random.random_sample(size=X.shape) # type: np.ndarray

# Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array.
centroids, labels = self.clusterer(random_data, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=random_data, labels=labels, centroids=centroids) # type: float
centroids, labels = self.clusterer(
random_data, n_clusters, **self.clusterer_kwargs
) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(
X=random_data, labels=labels, centroids=centroids
) # type: float
ref_dispersions[i] = dispersion

# Fit cluster to original data and create dispersion calc.
centroids, labels = self.clusterer(X, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray]
centroids, labels = self.clusterer(
X, n_clusters, **self.clusterer_kwargs
) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids)

# Calculate gap statistic
gap_value = np.mean(np.log(ref_dispersions)) - np.log(dispersion)

return gap_value, int(n_clusters)

def _process_with_rust(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_rust(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process gap stat using pure rust
"""
from gap_statistic.rust import gapstat

for label, gap_value in gapstat.optimal_k(X, list(cluster_array)):
yield (gap_value, label)

def _process_with_joblib(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_joblib(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using the joblib backend
"""
if Parallel is None:
raise EnvironmentError('joblib is not installed; cannot use joblib as the parallel backend!')
raise EnvironmentError(
"joblib is not installed; cannot use joblib as the parallel backend!"
)

with Parallel(n_jobs=self.n_jobs) as parallel:
for gap_value, n_clusters in parallel(delayed(self._calculate_gap)(X, n_refs, n_clusters)
for n_clusters in cluster_array):
for gap_value, n_clusters in parallel(
delayed(self._calculate_gap)(X, n_refs, n_clusters)
for n_clusters in cluster_array
):
yield (gap_value, n_clusters)

def _process_with_multiprocessing(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_multiprocessing(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using the multiprocessing library
"""
with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:

jobs = [executor.submit(self._calculate_gap, X, n_refs, n_clusters)
for n_clusters in cluster_array
]
jobs = [
executor.submit(self._calculate_gap, X, n_refs, n_clusters)
for n_clusters in cluster_array
]

for future in as_completed(jobs):
gap_value, k = future.result()
yield (gap_value, k)

def _process_non_parallel(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_non_parallel(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using no parallel backend; simple for loop generator
"""
for gap_value, n_clusters in [self._calculate_gap(X, n_refs, n_clusters)
for n_clusters in cluster_array]:
for gap_value, n_clusters in [
self._calculate_gap(X, n_refs, n_clusters) for n_clusters in cluster_array
]:
yield (gap_value, n_clusters)

def __str__(self):
return 'OptimalK(n_jobs={}, parallel_backend="{}")'.format(self.n_jobs, self.parallel_backend)
return 'OptimalK(n_jobs={}, parallel_backend="{}")'.format(
self.n_jobs, self.parallel_backend
)

def __repr__(self):
return self.__str__()

def _repr_html_(self):
return '<p>{}</p>'.format(self.__str__())
return "<p>{}</p>".format(self.__str__())
94 changes: 55 additions & 39 deletions setup.py
@@ -6,49 +6,65 @@
from setuptools_rust import RustExtension, Binding
except ImportError:
import subprocess
errno = subprocess.call([sys.executable, '-m', 'pip', 'install', 'setuptools-rust>=0.9.2'])

errno = subprocess.call(
[sys.executable, "-m", "pip", "install", "setuptools-rust>=0.9.2"]
)
if errno:
print("Please install the 'setuptools-rust>=0.9.2' package")
raise SystemExit(errno)
else:
from setuptools_rust import RustExtension, Binding

install_requires = ["numpy", "pandas", "scipy"]

setup_requires = [
"setuptools-rust>=0.9.2",
"pytest-runner",
"pytest",
"scikit-learn",
"joblib",
"scipy",
"pandas",
]
tests_require = ["scikit-learn", "pytest", "joblib", "black", "click"]

setup(name='gap-stat',
version=__version__,
author='Miles Granger',
maintainer='Miles Granger',
author_email='[email protected]',
maintainer_email='[email protected]',
keywords='kmeans unsupervised learning machine-learning clustering',
description='Python implementation of the gap statistic with Rust optimizations.',
long_description='Uses the gap statistic method by Tibshirani, Walther, Hastie to suggest n_clusters.',
packages=['gap_statistic'],
rust_extensions=[
RustExtension('gap_statistic.rust.gapstat', 'Cargo.toml', binding=Binding.PyO3)
],
license='BSD',
url='https://github.com/milesgranger/gap_statistic',
zip_safe=False,
setup_requires=['setuptools-rust>=0.9.2', 'pytest-runner', 'pytest', 'scikit-learn', 'joblib', 'scipy', 'pandas'],
install_requires=['numpy', 'pandas', 'scipy'],
tests_require=['scikit-learn', 'pytest', 'joblib'],
test_suite='tests',
include_package_data=True,
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Financial and Insurance Industry',
'Intended Audience :: Information Technology',
'Intended Audience :: Science/Research',
'Programming Language :: Python',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Rust',
'Operating System :: Microsoft :: Windows',
'Operating System :: POSIX',
'Operating System :: Unix',
'Operating System :: MacOS :: MacOS X',
],
)
setup(
name="gap-stat",
version=__version__,
author="Miles Granger",
maintainer="Miles Granger",
author_email="[email protected]",
maintainer_email="[email protected]",
keywords="kmeans unsupervised learning machine-learning clustering",
description="Python implementation of the gap statistic with Rust optimizations.",
long_description="Uses the gap statistic method by Tibshirani, Walther, Hastie to suggest n_clusters.",
packages=["gap_statistic"],
rust_extensions=[
RustExtension("gap_statistic.rust.gapstat", "Cargo.toml", binding=Binding.PyO3)
],
license="BSD",
url="https://github.com/milesgranger/gap_statistic",
zip_safe=False,
setup_requires=setup_requires,
install_requires=install_requires,
tests_require=tests_require,
test_suite="tests",
include_package_data=True,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Financial and Insurance Industry",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Programming Language :: Python",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Rust",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX",
"Operating System :: Unix",
"Operating System :: MacOS :: MacOS X",
],
)
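Since setup.py registers gap_statistic.rust.gapstat as a PyO3 RustExtension, installing from a source checkout compiles the Rust crate during the build. A minimal sketch (assumptions: the repository is cloned locally and a Rust toolchain with cargo is available on PATH):

import subprocess
import sys

# pip drives setup.py, which installs setuptools-rust if needed and builds
# the gapstat extension from Cargo.toml before installing the package.
subprocess.check_call([sys.executable, "-m", "pip", "install", "."])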
27 changes: 27 additions & 0 deletions tests/test_formatting.py
@@ -0,0 +1,27 @@
import os

import pytest


@pytest.mark.skipif(
os.environ.get("AGENT_OS") == "Windows_NT",
reason="Black formatting fails on Azure Windows builds.",
)
def test_formatting():
"""
Ensure project adheres to black style
"""
import black
from click.testing import CliRunner

print(os.environ)

proj_path = os.path.join(os.path.dirname(__file__), "..", "gap_statistic")
tests_path = os.path.join(os.path.dirname(__file__), "..", "tests")
setuppy = os.path.join(os.path.dirname(__file__), "..", "setup.py")

runner = CliRunner()
resp = runner.invoke(black.main, ["--check", "-v", proj_path, tests_path, setuppy])
assert resp.exit_code == 0, "Would still reformat one or more files:\n{}".format(
resp.output
)
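The formatting check runs with the rest of the test suite; a minimal sketch of invoking just this test (assuming black and click are installed, as listed in tests_require):

import pytest

# Returns a non-zero exit code if black would still reformat any file.
raise SystemExit(pytest.main(["-q", "tests/test_formatting.py"]))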