Add black formatting
milesgranger committed May 19, 2019
1 parent fe1b1d3 commit bdb4af1
Showing 6 changed files with 212 additions and 95 deletions.
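For context, black's check mode is what this commit standardizes on: it exits non-zero when any file would be reformatted, which is exactly what the new tests/test_formatting.py below asserts. A minimal sketch of running that check over the same paths (an assumed invocation for illustration, not part of this commit):

import subprocess
import sys

# --check reports files that would change without rewriting them; -v lists each file inspected.
result = subprocess.run(
    [sys.executable, "-m", "black", "--check", "-v", "gap_statistic", "tests", "setup.py"]
)
print("formatting ok" if result.returncode == 0 else "one or more files would be reformatted")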
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@
[![Downloads](http://pepy.tech/badge/gap-stat)](http://pepy.tech/project/gap-stat)
[![Coverage Status](https://coveralls.io/repos/github/milesgranger/gap_statistic/badge.svg)](https://coveralls.io/github/milesgranger/gap_statistic)
[![Code Health](https://landscape.io/github/milesgranger/gap_statistic/master/landscape.svg?style=flat)](https://landscape.io/github/milesgranger/gap_statistic/master)
[![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)


[![Anaconda](https://anaconda.org/milesgranger/gap-stat/badges/version.svg)](https://anaconda.org/milesgranger/gap-stat)
3 changes: 2 additions & 1 deletion gap_statistic/__init__.py
@@ -1,2 +1,3 @@
from gap_statistic.optimalK import OptimalK
__version__ = '1.6.0'

__version__ = "1.6.0"
139 changes: 100 additions & 39 deletions gap_statistic/optimalK.py
@@ -12,7 +12,9 @@
from joblib import Parallel, delayed
except ImportError:
Parallel, delayed = None, None
warnings.warn('joblib not installed, will be unavailable as a backend for parallel processing.')
warnings.warn(
"joblib not installed, will be unavailable as a backend for parallel processing."
)


class OptimalK:
@@ -29,9 +31,16 @@ class OptimalK:
>>> optimalK(X, cluster_array=[1,2,3,4,5])
3
"""

gap_df = None

def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Callable=None, clusterer_kwargs: dict=None) -> None:
def __init__(
self,
n_jobs: int = -1,
parallel_backend: str = "joblib",
clusterer: Callable = None,
clusterer_kwargs: dict = None,
) -> None:
"""
Construct OptimalK to use n_jobs (multiprocessing using joblib, multiprocessing, or single core).
If parallel_backend == 'rust' it will use all cores.
@@ -41,13 +50,26 @@ def __init__(self, n_jobs: int=-1, parallel_backend: str='joblib', clusterer: Ca
:param clusterer:
:param clusterer_kwargs:
"""
self.parallel_backend = parallel_backend if parallel_backend in ['joblib', 'multiprocessing', 'rust'] else None
self.parallel_backend = (
parallel_backend
if parallel_backend in ["joblib", "multiprocessing", "rust"]
else None
)
self.n_jobs = n_jobs if 1 <= n_jobs <= cpu_count() else cpu_count() # type: int
self.n_jobs = 1 if parallel_backend is None else self.n_jobs
self.clusterer = clusterer if clusterer is not None else kmeans2
self.clusterer_kwargs = clusterer_kwargs or dict() if clusterer is not None else dict(iter=10, minit='points')

def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_array: Iterable[int]=()):
self.clusterer_kwargs = (
clusterer_kwargs or dict()
if clusterer is not None
else dict(iter=10, minit="points")
)

def __call__(
self,
X: Union[pd.DataFrame, np.ndarray],
n_refs: int = 3,
cluster_array: Iterable[int] = (),
):
"""
Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
http://www.web.stanford.edu/~hastie/Papers/gap.pdf
@@ -60,25 +82,29 @@ def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_ar
# Raise error if values are less than 1 or larger than the number of samples in the set.
cluster_array = np.array([x for x in cluster_array]).astype(int)
if np.where(cluster_array < 1)[0].shape[0]:
raise ValueError('cluster_array contains values less than 1: {}'
.format(cluster_array[np.where(cluster_array < 1)[0]])
)
raise ValueError(
"cluster_array contains values less than 1: {}".format(
cluster_array[np.where(cluster_array < 1)[0]]
)
)
if cluster_array.shape[0] > X.shape[0]:
raise ValueError('The number of suggested clusters to try ({}) is larger than samples in dataset. ({})'
.format(cluster_array.shape[0], X.shape[0])
)
raise ValueError(
"The number of suggested clusters to try ({}) is larger than samples in dataset. ({})".format(
cluster_array.shape[0], X.shape[0]
)
)
if not cluster_array.shape[0]:
raise ValueError('The supplied cluster_array has no values.')
raise ValueError("The supplied cluster_array has no values.")

# Array of resulting gaps.
gap_df = pd.DataFrame({'n_clusters': [], 'gap_value': []})
gap_df = pd.DataFrame({"n_clusters": [], "gap_value": []})

# Define the compute engine; all methods take identical args and are generators.
if self.parallel_backend == 'joblib':
if self.parallel_backend == "joblib":
engine = self._process_with_joblib
elif self.parallel_backend == 'multiprocessing':
elif self.parallel_backend == "multiprocessing":
engine = self._process_with_multiprocessing
elif self.parallel_backend == 'rust':
elif self.parallel_backend == "rust":
engine = self._process_with_rust
else:
engine = self._process_non_parallel
@@ -87,20 +113,32 @@ def __call__(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int=3, cluster_ar
for (gap_value, n_clusters) in engine(X, n_refs, cluster_array):

# Assign this loop's gap statistic to gaps
gap_df = gap_df.append({'n_clusters': n_clusters, 'gap_value': gap_value}, ignore_index=True)
gap_df = gap_df.append(
{"n_clusters": n_clusters, "gap_value": gap_value}, ignore_index=True
)

self.gap_df = gap_df.sort_values(by='n_clusters', ascending=True).reset_index(drop=True)
self.gap_df = gap_df.sort_values(by="n_clusters", ascending=True).reset_index(
drop=True
)
return int(self.gap_df.loc[np.argmax(self.gap_df.gap_value.values)].n_clusters)

@staticmethod
def _calculate_dispersion(X: Union[pd.DataFrame, np.ndarray], labels: np.ndarray, centroids: np.ndarray) -> float:
def _calculate_dispersion(
X: Union[pd.DataFrame, np.ndarray], labels: np.ndarray, centroids: np.ndarray
) -> float:
"""
Calculate the dispersion between actual points and their assigned centroids
"""
disp = np.sum(np.sum([np.abs(inst - centroids[label]) ** 2 for inst, label in zip(X, labels)])) # type: float
disp = np.sum(
np.sum(
[np.abs(inst - centroids[label]) ** 2 for inst, label in zip(X, labels)]
)
) # type: float
return disp

def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clusters: int) -> Tuple[float, int]:
def _calculate_gap(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clusters: int
) -> Tuple[float, int]:
"""
Calculate the gap value of the given data, n_refs, and number of clusters.
Return the resulting gap value and n_clusters
@@ -115,66 +153,89 @@ def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clus
random_data = np.random.random_sample(size=X.shape) # type: np.ndarray

# Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array.
centroids, labels = self.clusterer(random_data, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=random_data, labels=labels, centroids=centroids) # type: float
centroids, labels = self.clusterer(
random_data, n_clusters, **self.clusterer_kwargs
) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(
X=random_data, labels=labels, centroids=centroids
) # type: float
ref_dispersions[i] = dispersion

# Fit cluster to original data and create dispersion calc.
centroids, labels = self.clusterer(X, n_clusters, **self.clusterer_kwargs) # type: Tuple[np.ndarray, np.ndarray]
centroids, labels = self.clusterer(
X, n_clusters, **self.clusterer_kwargs
) # type: Tuple[np.ndarray, np.ndarray]
dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids)

# Calculate gap statistic
gap_value = np.mean(np.log(ref_dispersions)) - np.log(dispersion)

return gap_value, int(n_clusters)

def _process_with_rust(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_rust(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process gap stat using pure rust
"""
from gap_statistic.rust import gapstat

for label, gap_value in gapstat.optimal_k(X, list(cluster_array)):
yield (gap_value, label)

def _process_with_joblib(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_joblib(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using the joblib backend
"""
if Parallel is None:
raise EnvironmentError('joblib is not installed; cannot use joblib as the parallel backend!')
raise EnvironmentError(
"joblib is not installed; cannot use joblib as the parallel backend!"
)

with Parallel(n_jobs=self.n_jobs) as parallel:
for gap_value, n_clusters in parallel(delayed(self._calculate_gap)(X, n_refs, n_clusters)
for n_clusters in cluster_array):
for gap_value, n_clusters in parallel(
delayed(self._calculate_gap)(X, n_refs, n_clusters)
for n_clusters in cluster_array
):
yield (gap_value, n_clusters)

def _process_with_multiprocessing(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_with_multiprocessing(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using the multiprocessing library
"""
with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:

jobs = [executor.submit(self._calculate_gap, X, n_refs, n_clusters)
for n_clusters in cluster_array
]
jobs = [
executor.submit(self._calculate_gap, X, n_refs, n_clusters)
for n_clusters in cluster_array
]

for future in as_completed(jobs):
gap_value, k = future.result()
yield (gap_value, k)

def _process_non_parallel(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray):
def _process_non_parallel(
self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, cluster_array: np.ndarray
):
"""
Process calling of .calculate_gap() method using no parallel backend; simple for loop generator
"""
for gap_value, n_clusters in [self._calculate_gap(X, n_refs, n_clusters)
for n_clusters in cluster_array]:
for gap_value, n_clusters in [
self._calculate_gap(X, n_refs, n_clusters) for n_clusters in cluster_array
]:
yield (gap_value, n_clusters)

def __str__(self):
return 'OptimalK(n_jobs={}, parallel_backend="{}")'.format(self.n_jobs, self.parallel_backend)
return 'OptimalK(n_jobs={}, parallel_backend="{}")'.format(
self.n_jobs, self.parallel_backend
)

def __repr__(self):
return self.__str__()

def _repr_html_(self):
return '<p>{}</p>'.format(self.__str__())
return "<p>{}</p>".format(self.__str__())
94 changes: 55 additions & 39 deletions setup.py
@@ -6,49 +6,65 @@
from setuptools_rust import RustExtension, Binding
except ImportError:
import subprocess
errno = subprocess.call([sys.executable, '-m', 'pip', 'install', 'setuptools-rust>=0.9.2'])

errno = subprocess.call(
[sys.executable, "-m", "pip", "install", "setuptools-rust>=0.9.2"]
)
if errno:
print("Please install the 'setuptools-rust>=0.9.2' package")
raise SystemExit(errno)
else:
from setuptools_rust import RustExtension, Binding

install_requires = ["numpy", "pandas", "scipy"]

setup_requires = [
"setuptools-rust>=0.9.2",
"pytest-runner",
"pytest",
"scikit-learn",
"joblib",
"scipy",
"pandas",
]
tests_require = ["scikit-learn", "pytest", "joblib", "black", "click"]

setup(name='gap-stat',
version=__version__,
author='Miles Granger',
maintainer='Miles Granger',
author_email='[email protected]',
maintainer_email='[email protected]',
keywords='kmeans unsupervised learning machine-learning clustering',
description='Python implementation of the gap statistic with Rust optimizations.',
long_description='Uses the gap statistic method by Tibshirani, Walther, Hastie to suggest n_clusters.',
packages=['gap_statistic'],
rust_extensions=[
RustExtension('gap_statistic.rust.gapstat', 'Cargo.toml', binding=Binding.PyO3)
],
license='BSD',
url='https://github.com/milesgranger/gap_statistic',
zip_safe=False,
setup_requires=['setuptools-rust>=0.9.2', 'pytest-runner', 'pytest', 'scikit-learn', 'joblib', 'scipy', 'pandas'],
install_requires=['numpy', 'pandas', 'scipy'],
tests_require=['scikit-learn', 'pytest', 'joblib'],
test_suite='tests',
include_package_data=True,
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Financial and Insurance Industry',
'Intended Audience :: Information Technology',
'Intended Audience :: Science/Research',
'Programming Language :: Python',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Rust',
'Operating System :: Microsoft :: Windows',
'Operating System :: POSIX',
'Operating System :: Unix',
'Operating System :: MacOS :: MacOS X',
],
)
setup(
name="gap-stat",
version=__version__,
author="Miles Granger",
maintainer="Miles Granger",
author_email="[email protected]",
maintainer_email="[email protected]",
keywords="kmeans unsupervised learning machine-learning clustering",
description="Python implementation of the gap statistic with Rust optimizations.",
long_description="Uses the gap statistic method by Tibshirani, Walther, Hastie to suggest n_clusters.",
packages=["gap_statistic"],
rust_extensions=[
RustExtension("gap_statistic.rust.gapstat", "Cargo.toml", binding=Binding.PyO3)
],
license="BSD",
url="https://github.com/milesgranger/gap_statistic",
zip_safe=False,
setup_requires=setup_requires,
install_requires=install_requires,
tests_require=tests_require,
test_suite="tests",
include_package_data=True,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Financial and Insurance Industry",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Programming Language :: Python",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Rust",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX",
"Operating System :: Unix",
"Operating System :: MacOS :: MacOS X",
],
)
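Since setup.py registers gap_statistic.rust.gapstat as a PyO3 RustExtension, installing from a source checkout compiles the Rust crate during the build. A minimal sketch (assumptions: the repository is cloned locally and a Rust toolchain with cargo is available on PATH):

import subprocess
import sys

# pip drives setup.py, which installs setuptools-rust if needed and builds
# the gapstat extension from Cargo.toml before installing the package.
subprocess.check_call([sys.executable, "-m", "pip", "install", "."])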
27 changes: 27 additions & 0 deletions tests/test_formatting.py
@@ -0,0 +1,27 @@
import os

import pytest


@pytest.mark.skipif(
os.environ.get("AGENT_OS") == "Windows_NT",
reason="Black formatting fails on Azure Windows builds.",
)
def test_formatting():
"""
Ensure project adheres to black style
"""
import black
from click.testing import CliRunner

print(os.environ)

proj_path = os.path.join(os.path.dirname(__file__), "..", "gap_statistic")
tests_path = os.path.join(os.path.dirname(__file__), "..", "tests")
setuppy = os.path.join(os.path.dirname(__file__), "..", "setup.py")

runner = CliRunner()
resp = runner.invoke(black.main, ["--check", "-v", proj_path, tests_path, setuppy])
assert resp.exit_code == 0, "Would still reformat one or more files:\n{}".format(
resp.output
)
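The formatting check runs with the rest of the test suite; a minimal sketch of invoking just this test (assuming black and click are installed, as listed in tests_require):

import pytest

# Returns a non-zero exit code if black would still reformat any file.
raise SystemExit(pytest.main(["-q", "tests/test_formatting.py"]))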