Commit
Merge pull request #579 from sjspielman/sjspielman/544-doublets-on-scpca
Detect doublets in ScPCA data
sjspielman authored Jul 14, 2024
2 parents f1adf8e + 08d0068 commit e885824
Showing 13 changed files with 263 additions and 3,096 deletions.
56 changes: 15 additions & 41 deletions .github/workflows/run_doublet-detection.yml
@@ -19,16 +19,17 @@ concurrency:
on:
workflow_dispatch:
workflow_call:
pull_request:
branches:
- main
paths:
- analyses/doublet-detection/**

jobs:
run-module:
if: github.repository_owner == 'AlexsLemonade'
runs-on: ubuntu-latest
container: public.ecr.aws/openscpca/doublet-detection:latest
defaults:
run:
shell: bash -el {0}
@@ -37,43 +37,16 @@ jobs:
- name: Checkout repo
uses: actions/checkout@v4

      - name: Download test data in SCE format
        run: |
          ./download-data.py --test-data --format SCE

      - name: Run doublet-detection module
        run: |
          projects=$(basename -a data/current/SCPCP*)
          cd ${MODULE_PATH}
          for project in $projects; do
              bash run_doublet-detection-scpca.sh $project
          done
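The `basename -a` project-discovery idiom in the workflow step above can be exercised against a throwaway directory tree. A minimal sketch; the `data/current` layout below is mocked, not real OpenScPCA data:

```shell
# Mock the data/current layout the workflow expects, then discover project IDs
# exactly as the workflow step does. All paths here are throwaway.
set -euo pipefail

tmp=$(mktemp -d)
mkdir -p "${tmp}/data/current/SCPCP000001" "${tmp}/data/current/SCPCP000002"

# `basename -a` strips the leading path from each match, leaving one project ID per line
projects=$(basename -a "${tmp}"/data/current/SCPCP*)

for project in $projects; do
    echo "would run: bash run_doublet-detection-scpca.sh ${project}"
done

rm -rf "${tmp}"
```

Because discovery happens via a glob on the data directory, any project downloaded into `data/current` is picked up without editing the workflow.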
67 changes: 54 additions & 13 deletions analyses/doublet-detection/README.md
@@ -22,17 +22,30 @@ conda-lock install --name openscpca-doublet-detection conda-lock.yml
conda activate openscpca-doublet-detection
```

This module has two portions, which can be run as follows:

- Benchmarking analysis: Compare several doublet detection methods on publicly available (non-ScPCA) datasets with annotated doublets.
This analysis runs each method and assesses its performance against the ground-truth annotations.
```sh
./run_doublet-detection-benchmark.sh
```

- ScPCA analysis: Detect doublets in ScPCA data using `scDblFinder` for a given project id (`SCPCPXXXXXX`).
- The conda environment is not needed for this analysis, since it uses only R.
```sh
./run_doublet-detection-scpca.sh {scpca project id}

# for example, detect doublets on all libraries in ScPCA project SCPCP000001:
./run_doublet-detection-scpca.sh SCPCP000001
```



## Input files

### Benchmarking input files

The benchmarking portion of this module uses input data from [a Zenodo repository](https://doi.org/10.5281/zenodo.4562782) to explore doublet detection methods.
Specifically, these datasets are used (descriptions from [Xi and Li (2021)](https://doi.org/10.1016/j.cels.2020.11.008)):
- `hm-6k`
- A mixture of human `HEK293T` and mouse `NIH3T3` cells with 6806 droplets
@@ -47,24 +60,52 @@ Specifically, these datasets are used (descriptions from [Xi and Li (2021)](http
- `PBMCs` from a patient with systemic lupus erythematosus
- Droplets were annotated with `demuxlet`

### ScPCA input files

The ScPCA portion of this module uses `processed` SCE files from the most recent OpenScPCA data release, which can be obtained with:

```sh
# to be run from the root of the repository
./download-data.py
```


## Output files

Below is the results directory structure, annotated with file descriptions, after both `run_doublet-detection-benchmark.sh` and `run_doublet-detection-scpca.sh` have been run.

```
results
├── README.md
├── benchmark-results
│   ├── {dataset}_scdblfinder.tsv # TSV files with `scDblFinder` inferences
│   ├── {dataset}_scrublet.tsv # TSV files with `Scrublet` inferences
│   └── rendered-notebooks # Exploration of doublet detection results for each individual dataset
│   ├── compare-doublet-results.nb.html # Created from exploratory-notebooks/03_compare-benchmark-results.Rmd
│   └── {dataset}-doublet-results.nb.html # Created from template-notebooks/02_explore-benchmark-results.Rmd
└── scpca-results # Results from running `scDblFinder` across ScPCA projects
└── {project id}
└── {sample id}
└── {library id}_processed_scdblfinder.tsv # TSV file with doublet results
```

### Result TSV files

The benchmarking TSV files have the following columns:

- `{dataset}_scdblfinder.tsv`: `barcodes`, `score`, `class`, `cxds_score`
- `score` and `class` are the `scDblFinder` score and prediction columns, respectively, and `cxds_score` is a modified version of the [`scds::cxds` score](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html) as calculated by `scDblFinder`
- `{dataset}_scrublet.tsv`: `barcodes`, `scrublet_score`, `scrublet_prediction`

The ScPCA TSV files for `scDblFinder` results have only the columns `barcodes`, `score`, and `class`.
Note that SCEs with fewer than 10 droplets will not be run through `scDblFinder`, so their associated result TSVs will contain `NA` values in the `score` and `class` columns.
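Downstream consumers may want to drop those placeholder rows before use. A minimal sketch with a mocked TSV (the column layout matches the documentation above; the file contents are fabricated):

```shell
# Build a tiny stand-in for a {library id}_processed_scdblfinder.tsv file and
# drop rows where scDblFinder reported NA (i.e., libraries with < 10 droplets).
set -euo pipefail

tsv=$(mktemp)
printf 'barcodes\tscore\tclass\n' > "${tsv}"
printf 'AAACCTGA\t0.91\tdoublet\n' >> "${tsv}"
printf 'TTTGGTCA\tNA\tNA\n' >> "${tsv}"

# Keep the header row plus any row whose score column is not NA
filtered=$(awk -F'\t' 'NR == 1 || $2 != "NA"' "${tsv}")
echo "${filtered}"

rm -f "${tsv}"
```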


## Software requirements

This module uses both `renv` and `conda` to manage software dependencies.
A Dockerfile is also provided.

## Computational resources

This module does not require compute beyond what is generally available on a laptop.
@@ -163,7 +163,7 @@ doublet_df_list <- dataset_names |>
scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv"))
scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv"))
sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}.rds"))
scdbl_df <- scdbl_tsv |>
readr::read_tsv(show_col_types = FALSE) |>

This file was deleted.

3 changes: 3 additions & 0 deletions analyses/doublet-detection/results/README.md
@@ -4,3 +4,6 @@ Results files are available in `s3://researcher-654654257431-us-east-2/doublet-d

- `benchmark-results` contains TSV files with `scDblFinder` and `scrublet` inferences on ground-truth benchmarking datasets
- `benchmark-results/exploratory-notebooks` contains knitted notebooks exploring the benchmarking results
- `scpca-results` contains TSV files with `scDblFinder` results for ScPCA data
- This directory is organized by project and sample IDs as: `scpca-results/{project id}/{sample id}/{library id}_processed_scdblfinder.tsv`
- Currently, the directory contains only results for `SCPCP000001`
62 changes: 62 additions & 0 deletions analyses/doublet-detection/run_doublet-detection-benchmark.sh
@@ -0,0 +1,62 @@
#!/bin/bash

# This script runs the benchmarking portion of the `doublet-detection` module

set -euo pipefail


# Set up --------------

# Ensure script is being run from its directory
MODULE_DIR=$(dirname "${BASH_SOURCE[0]}")
cd ${MODULE_DIR}

TEMPLATE_NB_DIR="template-notebooks" # directory with template notebooks
EXPLORE_NB_DIR="exploratory-notebooks" # directory with exploratory notebooks

# Create benchmark directories
DATA_DIR="scratch/benchmark-datasets"
RESULTS_DIR="results/benchmark-results"
RESULTS_NB_DIR="${RESULTS_DIR}/rendered-notebooks"
mkdir -p ${DATA_DIR}
mkdir -p ${RESULTS_DIR}
mkdir -p ${RESULTS_NB_DIR}


# define benchmarking datasets to use
bench_datasets=("hm-6k" "pbmc-1B-dm" "pdx-MULTI" "HMEC-orig-MULTI")

# Download and unzip `real_datasets.zip` archive from https://doi.org/10.5281/zenodo.4562782
# Files are saved in $DATA_DIR/raw
wget https://zenodo.org/records/4562782/files/real_datasets.zip
unzip real_datasets.zip -d ${DATA_DIR}/raw
rm real_datasets.zip

for dataset in "${bench_datasets[@]}"; do

# formatted SCE and AnnData files will be saved here
DATASET_DIR=${DATA_DIR}/$dataset
mkdir -p $DATASET_DIR

# Read raw downloaded data and export SCE, AnnData files
./scripts/00_format-benchmark-data.R --dataset ${dataset} --input_dir ${DATA_DIR}/raw --output_dir ${DATASET_DIR}

# Infer doublets with scDblFinder
./scripts/01a_run-scdblfinder.R --input_sce_file ${DATASET_DIR}/${dataset}.rds --results_dir ${RESULTS_DIR} --benchmark

# Infer doublets with scrublet
./scripts/01b_run-scrublet.py --input_anndata_file ${DATASET_DIR}/${dataset}.h5ad --results_dir ${RESULTS_DIR}

# Explore each individual set of doublet results
Rscript -e "rmarkdown::render('${TEMPLATE_NB_DIR}/02_explore-benchmark-results.Rmd',
output_dir = '${RESULTS_NB_DIR}',
output_file = '${dataset}_doublet-results.html',
params = list(dataset = '${dataset}'),
clean = TRUE)"
done

# Compare doublet inferences across methods, on all datasets processed
Rscript -e "rmarkdown::render('${EXPLORE_NB_DIR}/03_compare-benchmark-results.Rmd',
output_dir = '${RESULTS_NB_DIR}',
output_file = 'compare-benchmark-results.nb.html',
clean = TRUE)"
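Both run scripts open with the same `dirname "${BASH_SOURCE[0]}"` idiom so they behave identically regardless of the caller's working directory. A self-contained sketch of just that pattern (the demo script is throwaway, not part of the module):

```shell
# Write a throwaway script that uses the module's cd-to-own-directory idiom,
# then invoke it from elsewhere and confirm it ran from its own directory.
set -euo pipefail

tmp=$(mktemp -d)
cat > "${tmp}/demo.sh" <<'EOF'
#!/bin/bash
set -euo pipefail
MODULE_DIR=$(dirname "${BASH_SOURCE[0]}")
cd "${MODULE_DIR}"
pwd
EOF
chmod +x "${tmp}/demo.sh"

# Invoke from / — the script still reports its own directory, not /
script_dir=$(cd / && "${tmp}/demo.sh")
echo "${script_dir}"

rm -rf "${tmp}"
```

This is why the module's relative paths (`scratch/`, `results/`, `../../data/current`) resolve correctly no matter where the scripts are launched from.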
33 changes: 33 additions & 0 deletions analyses/doublet-detection/run_doublet-detection-scpca.sh
@@ -0,0 +1,33 @@
#!/bin/bash

# This script runs doublet detection over ScPCA data for a given project
# Usage: ./run_doublet-detection-scpca.sh {scpca project id}

set -euo pipefail


# Ensure script is being run from its directory
MODULE_DIR=$(dirname "${BASH_SOURCE[0]}")
cd ${MODULE_DIR}

PROJECT_ID=$1

# Define directories
DATA_DIR="../../data/current"
RESULTS_DIR="results/scpca-results/${PROJECT_ID}"
mkdir -p ${RESULTS_DIR}

# Detect doublets on each processed SCE file in each sample directory
for SAMPLE_DIR in ${DATA_DIR}/${PROJECT_ID}/SCPCS*; do
SAMPLE_ID=$(basename $SAMPLE_DIR)
echo "Processing ${SAMPLE_ID}..."

SAMPLE_RESULTS_DIR=${RESULTS_DIR}/${SAMPLE_ID}
mkdir -p ${SAMPLE_RESULTS_DIR}

for SCE_FILE in ${SAMPLE_DIR}/*_processed.rds; do
Rscript scripts/01a_run-scdblfinder.R \
--input_sce_file ${SCE_FILE} \
--results_dir ${SAMPLE_RESULTS_DIR}
done
done
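The traversal above can be dry-run against a mocked data tree, with the `Rscript` call swapped for `echo` (all IDs and paths below are fabricated for the sketch):

```shell
# Mock one project with two samples, each holding one processed SCE file,
# then walk it with the same nested loop the run script uses.
set -euo pipefail

tmp=$(mktemp -d)
PROJECT_ID="SCPCP000001"   # hypothetical project ID, used only for this sketch
DATA_DIR="${tmp}/data/current"
RESULTS_DIR="${tmp}/results/scpca-results/${PROJECT_ID}"

for sample in SCPCS000001 SCPCS000002; do
    mkdir -p "${DATA_DIR}/${PROJECT_ID}/${sample}"
    touch "${DATA_DIR}/${PROJECT_ID}/${sample}/SCPCL000001_processed.rds"
done

# Same nested traversal as the run script, with the scDblFinder call stubbed out
for SAMPLE_DIR in "${DATA_DIR}/${PROJECT_ID}"/SCPCS*; do
    SAMPLE_ID=$(basename "${SAMPLE_DIR}")
    mkdir -p "${RESULTS_DIR}/${SAMPLE_ID}"
    for SCE_FILE in "${SAMPLE_DIR}"/*_processed.rds; do
        echo "would run scDblFinder on ${SCE_FILE}"
    done
done

# One per-sample result directory should exist for each mocked sample
n_sample_dirs=$(find "${RESULTS_DIR}" -mindepth 1 -maxdepth 1 -type d | wc -l)
echo "${n_sample_dirs}"

rm -rf "${tmp}"
```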
59 changes: 0 additions & 59 deletions analyses/doublet-detection/run_doublet-detection.sh

This file was deleted.

9 changes: 3 additions & 6 deletions analyses/doublet-detection/scripts/00_format-benchmark-data.R
@@ -12,10 +12,7 @@
# - In the SCE file, this is in the colData slot
# - In the AnnData file, this is in the obs slot


# Load libraries
library(SingleCellExperiment)
library(optparse)

@@ -47,8 +44,8 @@ if (!file.exists(input_file)) {
)
}

output_sce_file <- file.path(opts$output_dir, glue::glue("{opts$dataset_name}.rds"))
output_anndata_file <- file.path(opts$output_dir, glue::glue("{opts$dataset_name}.h5ad"))

dat <- readRDS(input_file)
mat <- dat[[1]] # raw counts matrix