neurostuff · tsalo · Jun 30, 2021 · Jun 30, 2021 · Jun 30, 2021 · Jul 1, 2021
diff --git a/nimare/dataset.py b/nimare/dataset.py
@@ -548,7 +548,8 @@ def get_studies_by_label(self, labels=None, label_threshold=0.001):
             the labels above the threshold, it will be returned.
             Default is None.
         label_threshold : :obj:`float`, optional
-            Default is 0.5.
+            Default is 0.001, which corresponds to terms appearing at least once in abstracts
+            for the commonly-used Neurosynth TF-IDF feature set.
 
         Returns
         -------
@@ -600,27 +601,50 @@ def get_studies_by_mask(self, mask):
         found_ids = list(self.coordinates.loc[distances, "id"].unique())
         return found_ids
 
-    def get_studies_by_coordinate(self, xyz, r=20):
+    def get_studies_by_coordinate(self, xyz, r=None, n=None):
         """Extract list of studies with at least one focus within radius of requested coordinates.
 
+        .. versionchanged:: 0.0.9
+
+            New option (n) supported to identify closest n studies to coordinate.
+            Default value for r changed to None.
+
         Parameters
         ----------
         xyz : (X x 3) array_like
             List of coordinates against which to find studies.
         r : :obj:`float`, optional
-            Radius (in mm) within which to find studies. Default is 20mm.
+            Radius (in mm) within which to find studies. Mutually exclusive with ``n``.
+            Default is None.
+        n : :obj:`int`, optional
+            Number of closest studies to identify. Mutually exclusive with ``r``.
+            Default is None.
 
         Returns
         -------
         found_ids : :obj:`list`
-            A list of IDs from the Dataset with at least one focus within
-            radius r of requested coordinates.
+            A list of IDs from the Dataset matching the search criterion.
         """
+
+        if n and r:
+            raise ValueError("Only one of 'r' and 'n' may be provided.")
+        elif not n and not r:
+            raise ValueError("Either 'r' or 'n' must be provided.")
+
         from scipy.spatial.distance import cdist
 
-        xyz = np.array(xyz)
+        temp_coords = self.coordinates.copy()
+
+        xyz = np.atleast_2d(xyz)
         assert xyz.shape[1] == 3 and xyz.ndim == 2
-        distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values)
-        distances = np.any(distances <= r, axis=0)
-        found_ids = list(self.coordinates.loc[distances, "id"].unique())
+        distances = cdist(xyz, temp_coords[["x", "y", "z"]].values)
+        distances = np.min(distances, axis=0)
+        temp_coords["distance"] = distances
+        min_dist_ser = temp_coords.groupby("id")["distance"].min()
+
+        if r:
+            found_ids = min_dist_ser.loc[min_dist_ser <= r].index.tolist()
+        elif n:
+            found_ids = min_dist_ser.nsmallest(n).index.tolist()
+
         return found_ids
diff --git a/nimare/parcellate.py b/nimare/parcellate.py
@@ -0,0 +1,166 @@
+"""Parcellation tools."""
+import inspect
+import logging
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+from .base import NiMAREBase
+from .meta.base import CBMAEstimator
+from .meta.cbma.ale import ALE
+from .results import MetaResult
+from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm
+
+LGR = logging.getLogger(__name__)
+
+
+class CoordCBP(NiMAREBase):
+    """Perform coordinate-based coactivation-based parcellation.
+
+    .. versionadded:: 0.0.10
+
+    Parameters
+    ----------
+    target_mask : :obj:`nibabel.Nifti1.Nifti1Image`
+        Mask of target of parcellation.
+        Currently must be in same space/resolution as Dataset mask.
+    n_clusters : :obj:`list` of :obj:`int`
+        Number of clusters to evaluate in clustering.
+    r : :obj:`float` or None, optional
+        Radius (in mm) within which to find studies. Mutually exclusive with ``n``.
+        Default is None.
+    n : :obj:`int` or None, optional
+        Number of closest studies to identify. Mutually exclusive with ``r``.
+        Default is None.
+    meta_estimator : :obj:`nimare.meta.base.CBMAEstimator`, optional
+        CBMA Estimator with which to run the MACMs.
+        Default is :obj:`nimare.meta.cbma.ale.ALE`.
+    target_image : :obj:`str`, optional
+        Name of meta-analysis results image to use for clustering.
+        Default is "ale", which is specific to the ALE estimator.
+    """
+
+    _required_inputs = {"coordinates": ("coordinates", None)}
+
+    def __init__(
+        self,
+        target_mask,
+        n_clusters,
+        r=None,
+        n=None,
+        meta_estimator=None,
+        target_image="ale",
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+        if meta_estimator is None:
+            meta_estimator = ALE()
+        else:
+            meta_estimator = check_type(meta_estimator, CBMAEstimator)
+
+        if r and n:
+            raise ValueError("Only one of 'r' and 'n' may be provided.")
+        elif not r and not n:
+            raise ValueError("Either 'r' or 'n' must be provided.")
+
+        self.meta_estimator = meta_estimator
+        self.target_image = target_image
+        self.n_clusters = listify(n_clusters)
+        self.r = r
+        self.n = n
+
+    def _preprocess_input(self, dataset):
+        """Mask required input images using either the dataset's mask or the estimator's.
+
+        Also, insert required metadata into coordinates DataFrame.
+        """
+        super()._preprocess_input(dataset)
+
+        # All extra (non-ijk) parameters for a kernel should be overrideable as
+        # parameters to __init__, so we can access them with get_params()
+        kt_args = list(self.kernel_transformer.get_params().keys())
+
+        # Integrate "sample_size" from metadata into DataFrame so that
+        # kernel_transformer can access it.
+        if "sample_size" in kt_args:
+            self.inputs_["coordinates"] = add_metadata_to_dataframe(
+                dataset,
+                self.inputs_["coordinates"],
+                metadata_field="sample_sizes",
+                target_column="sample_size",
+                filter_func=np.mean,
+            )
+
+    @use_memmap(LGR, n_files=1)
+    def _fit(self, dataset):
+        """Perform coordinate-based coactivation-based parcellation on dataset.
+
+        Parameters
+        ----------
+        dataset : :obj:`nimare.dataset.Dataset`
+            Dataset to analyze.
+        """
+        self.dataset = dataset
+        self.masker = self.masker or dataset.masker
+        self.null_distributions_ = {}
+
+        # Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC)
+        target_ijk = np.vstack(np.where(self.target_mask.get_fdata()))
+        target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine)
+        for i_coord in target_xyz.shape[1]:
+            xyz = target_xyz[:, i_coord]
+            macm_ids = dataset.get_studies_by_coordinate(xyz, r=self.r, n=self.n)
+            coord_dset = dataset.slice(macm_ids)
+
+            # This seems like a somewhat inelegant solution
+            # Check if the meta method is a pairwise estimator
+            if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args:
+                unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids)))
+                unselected_dset = dataset.slice(unselected_ids)
+                self.meta_estimator.fit(coord_dset, unselected_dset)
+            else:
+                self.meta_estimator.fit(coord_dset)
+            macm_data = self.meta_estimator.results.get_map(self.target_image, return_type="array")
+            if i_coord == 0:
+                data = np.zeros((target_xyz.shape[1], len(macm_data)))
+            data[i_coord, :] = macm_data
+
+        # Correlate voxel-wise MACM results
+        data = np.corrcoef(data)
+
+        # Convert voxel-wise correlation matrix to distances
+        data = 1 - data
+
+        # Perform clustering
+        labels = np.zeros((len(self.n_clusters), len(macm_data)), dtype=int)
+        for i_cluster, cluster_count in enumerate(self.n_clusters):
+            kmeans = KMeans(n_clusters=cluster_count, random_state=0).fit(data)
+            labels[i_cluster, :] = kmeans.labels_
+
+        images = {"labels": labels}
+        return images
+
+    def fit(self, dataset, drop_invalid=True):
+        """Perform coordinate-based coactivation-based parcellation on dataset.
+
+        Parameters
+        ----------
+        dataset : :obj:`nimare.dataset.Dataset`
+            Dataset to analyze.
+        drop_invalid : :obj:`bool`, optional
+            Whether to automatically ignore any studies without the required data or not.
+            Default is True.
+        """
+        self._validate_input(dataset, drop_invalid=drop_invalid)
+        self._preprocess_input(dataset)
+        maps = self._fit(dataset)
+
+        if hasattr(self, "masker") and self.masker is not None:
+            masker = self.masker
+        else:
+            masker = dataset.masker
+
+        self.results = MetaResult(self, masker, maps)
+        return self.results
diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py
@@ -3,6 +3,7 @@
 
 import nibabel as nib
 import numpy as np
+import pytest
 
 import nimare
 from nimare import dataset
@@ -23,7 +24,15 @@ def test_dataset_smoke():
     assert isinstance(dset.get_images(imtype="beta"), list)
     assert isinstance(dset.get_metadata(field="sample_sizes"), list)
     assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list)
-    assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list)
+    assert isinstance(dset.get_studies_by_coordinate([20, 20, 20], r=20), list)
+    assert len(dset.get_studies_by_coordinate([20, 20, 20], n=2)) == 2
+    assert len(dset.get_studies_by_coordinate([20, 20, 20], n=3)) == 3
+    with pytest.raises(ValueError):
+        dset.get_studies_by_coordinate([20, 20, 20])
+
+    with pytest.raises(ValueError):
+        dset.get_studies_by_coordinate([20, 20, 20], r=20, n=4)
+
     mask_data = np.zeros(dset.masker.mask_img.shape, int)
     mask_data[40, 40, 40] = 1
     mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)