Add clustering and spatial plot in integration report

nf-core · Dec 16, 2024 · c9f3dec · c9f3dec
1 parent 138ba83
commit c9f3dec
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 17 deletions.
diff --git a/bin/integration.qmd b/bin/integration.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Data Pre-processing and Integration with Quality Controls"
+title: "Data integration and batch correction"
 format:
     nf-core-html: default
 jupyter: python3
@@ -17,20 +17,23 @@ before merging them into a single integrated dataset for downstream analysis.
 # Import necessary packages
 import scanpy as sc
 import anndata as an
+import math
 import matplotlib.pyplot as plt
 import seaborn as sns
 import scanorama
 import spatialdata
 import os
 import glob
 import scipy.sparse as sp
+from matplotlib.legend import Legend
 ```
 
 ```{python}
 # Parameter cell
 #| tags: [parameters]
 #| echo: false
 input_sdata = "merged_sdata.zarr"  # Input: SpatialData file
+cluster_resolution = 1  # Resolution for Leiden clustering
 n_hvgs = 2000  # Parameter: Number of HVGs to use for analyses
 artifact_dir = "artifacts" # Parameter: artifact directory
 output_sdata = "integrated_sdata.zarr"  # Output: Integrated SpatialData file
@@ -99,24 +102,25 @@ adata_integrated
 
 # Dimensionality Reduction
 
-Finally, we perform dimensionality reduction to visualize the data.
+We perform dimensionality reduction to visualize the data.
 We compare the UMAP plots before and after batch correction.
 
 ```{python}
 # Create a figure with two subplots
-fig, axes = plt.subplots(1, 2, figsize=(12, 6))
-
+fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
+adata_integrated_copy = adata_integrated.copy()
 # Compute neighbors and UMAP before integration
-sc.pp.neighbors(adata_integrated)
-sc.tl.umap(adata_integrated)
+sc.pp.neighbors(adata_integrated_copy)
+sc.tl.umap(adata_integrated_copy)
 
 # Plot UMAP before integration
 sc.pl.umap(
-    adata_integrated,
+    adata_integrated_copy,
     color=["library_id"],
     palette=sc.pl.palettes.default_20,
     ax=axes[0],
-    show=False
+    show=False,
+    legend_loc=None,  # Disable the individual legend
 )
 axes[0].set_title("Before integration")
 
@@ -130,10 +134,93 @@ sc.pl.umap(
     color=["library_id"],
     palette=sc.pl.palettes.default_20,
     ax=axes[1],
-    show=False
+    show=False,
+    legend_loc=None,  # Disable the individual legend
 )
 axes[1].set_title("After integration")
 
+# Add a shared legend at the bottom
+# Extract unique categories and colors from "library_id"
+library_ids = adata_integrated.obs["library_id"].cat.categories
+colors = sc.pl.palettes.default_20[:len(library_ids)]
+
+# Create custom legend elements
+legend_elements = [
+    plt.Line2D([0], [0], marker="o", color=color, linestyle="", markersize=8, label=lib_id)
+    for lib_id, color in zip(library_ids, colors)
+]
+
+# Add the legend to the figure
+fig.legend(
+    handles=legend_elements,
+    loc="lower center",
+    ncol=len(library_ids) // 2,  # Adjust columns based on the number of categories
+    bbox_to_anchor=(0.5, -0.05),  # Position the legend outside the plot
+    frameon=False,
+)
+
+# Adjust layout and display the figure
+plt.tight_layout(rect=[0, 0.1, 1, 1])  # Leave space for the legend
+plt.show()
+```
+
+# Clustering
+
+We perform clustering on the integrated dataset. Principal
+Component Analysis (PCA) is applied to reduce dimensionality. The Leiden
+algorithm is employed for clustering with a given resolution.
+
+```{python}
+sc.pp.pca(adata_integrated)
+sc.pp.neighbors(adata_integrated)
+sc.tl.leiden(adata_integrated, key_added="clusters", resolution=cluster_resolution)
+```
+
+We compare the clusters of the integrated samples before and after batch correction.
+
+```{python}
+# Plot UMAP after integration
+sc.pl.umap(
+    adata_integrated,
+    color=["clusters"],
+    palette=sc.pl.palettes.godsnot_102
+)
+```
+
+# Spatial Visualization
+
+Finally, we visualize the spatial distribution of clusters on the tissue image of all samples.
+
+```{python}
+# We have N samples, we will create a 2xN/2 grid of plots
+n_samples = len(sdata.tables)
+n_cols = math.ceil(n_samples / 2)
+n_rows = 2
+
+# Create a figure with subplots
+fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows))
+
+# Flatten the axes for easier indexing
+axes = axes.flatten()
+
+# Plot spatial distribution of clusters for each sample
+for i, (table_name, adata) in enumerate(sdata.tables.items()):
+    sample_name = table_name.replace("_table", "")
+    sc.pl.spatial(
+        adata,
+        img_key="hires",
+        library_id=f"{sample_name}_hires_image",
+        color="clusters",
+        size=1.25,
+        ax=axes[i],
+        show=False,
+    )
+    axes[i].set_title(f"Sample: {sample_name}")
+
+# Hide any remaining axes
+for j in range(i + 1, len(axes)):
+    axes[j].axis("off")
+
 # Adjust layout and display the figure
 plt.tight_layout()
 plt.show()

diff --git a/nextflow.config b/nextflow.config
@@ -34,9 +34,10 @@ params {
     n_top_svgs                = 14
 
     // Data aggregation
-    merge_sdata                  = false
-    integrate_sdata              = false
-    integration_n_hvgs           = 2000
+    merge_sdata                    = false
+    integrate_sdata                = false
+    integration_cluster_resolution = 1.0
+    integration_n_hvgs             = 2000
 
     // MultiQC options
     multiqc_config               = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -180,6 +180,13 @@
                     "description": "Integrate per-sample SpatialData objects into one and output a integration report.",
                     "fa_icon": "fas fa-arrows-to-dot"
                 },
+                "integration_cluster_resolution": {
+                    "type": "number",
+                    "default": 1,
+                    "description": "The resolution for the clustering of the spots after integration of samples.",
+                    "help_text": "The resolution controls the coarseness of the clustering, where a higher resolution leads to more clusters.",
+                    "fa_icon": "fas fa-circle-nodes"
+                },
                 "integration_n_hvgs": {
                     "type": "integer",
                     "default": 2000,

diff --git a/subworkflows/local/aggregation.nf b/subworkflows/local/aggregation.nf
@@ -44,11 +44,12 @@ workflow AGGREGATION {
     ch_integrated_adata = Channel.empty()
     if (params.integrate_sdata) {
         integration_params = [
-            input_sdata: "merged_sdata.zarr",
-            n_hvgs: params.integration_n_hvgs,
-            artifact_dir: "artifacts",
-            output_adata: "integrated_adata.h5ad",
-            output_sdata: "integrated_sdata.zarr"
+            input_sdata:        "merged_sdata.zarr",
+            cluster_resolution: params.integration_cluster_resolution,
+            n_hvgs:             params.integration_n_hvgs,
+            artifact_dir:       "artifacts",
+            output_adata:       "integrated_adata.h5ad",
+            output_sdata:        "integrated_sdata.zarr"
         ]
         INTEGRATE_SDATA (
             [[id:"integration"], integration_notebook],