From 970a063bd968ea25ff6b051139a134a6fd8802a9 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 19 Jul 2024 14:48:42 -0500
Subject: [PATCH 01/10] template for evaluating singler

---
 .../singler-workflow/01-singler-results.Rmd   | 335 ++++++++++++++++++
 1 file changed, 335 insertions(+)
 create mode 100644 analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd

diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
new file mode 100644
index 000000000..5e86ea5e9
--- /dev/null
+++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
@@ -0,0 +1,335 @@
+---
+title: "Summary of tumor cell classification with `SingleR` for `r params$library_id`"
+author: Ally Hawkins
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: true
+    toc_depth: 3
+    code_folding: "hide"
+params:
+  library_id: NULL
+  sce_file: NULL
+  singler_results_file: NULL
+  geneset_scores_file: NULL
+  marker_genes_file: NULL
+---
+
+This report summarizes the classification of `r params$library_id` using the following approach: 
+
+- Tumor cells were identified by using `AUCell`. 
+These cells are labeled as `tumor_original`. 
+- All other cells were labeled using `SingleR` with a combination of three references: 
+  - All tumor cells annotated by `AUCell` for all other libraries in `SCPCP000015`. 
+  Any cells that are labeled as tumor using this reference are labeled as `tumor_new`. 
+  - `BlueprintEncodeData` from `celldex`
+  - `HumanPrimaryCellAtlasData` from `celldex`
+
+
+## Setup
+
+```{r}
+# check that sce and results files exist 
+stopifnot(
+  "sce file does not exist" = file.exists(params$sce_file),
+  "SingleR results file does not exist" = file.exists(params$singler_results_file), 
+  "Gene set scores file does not exist" = file.exists(params$geneset_scores_file),
+  "Marker genes file does not exist" = file.exists(params$marker_genes_file)
+)
+```
+
+
+```{r packages}
+suppressPackageStartupMessages({
+  # load required packages
+  library(SingleCellExperiment)
+  library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+  theme_bw()
+)
+
+# quiet messages
+options(readr.show_col_types = FALSE)
+ComplexHeatmap::ht_opt(message = FALSE)
+```
+
+
+```{r base paths}
+# The path to this module
+module_base <- rprojroot::find_root(rprojroot::is_renv_project)
+
+# source in helper functions: plot_gene_heatmap() and plot_cnv_heatmap() 
+# create_classification_df() and create_marker_gene_df()
+validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
+source(validation_functions)
+```
+
+```{r}
+# Read in files
+sce <- readr::read_rds(params$sce_file)
+singler_results_df <- readr::read_tsv(params$singler_results_file)
+geneset_scores_df <- readr::read_tsv(params$geneset_scores_file)
+```
+
+## Prep for plots
+
+```{r}
+# original library annotation 
+orig_tumor <- glue::glue("tumor-{params$library_id}")
+
+# generate classification df to use for plots 
+classification_df <- sce |> 
+  scuttle::makePerCellDF(use.dimred = "UMAP") |> 
+  # replace UMAP.1 with UMAP1
+  dplyr::rename_with(
+    \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")
+  ) |>
+  # get rid of excess columns
+  dplyr::select(barcodes, UMAP1, UMAP2) |> 
+  # join with previous annotations, singler results, and gene set scores 
+  dplyr::left_join(singler_results_df, by = "barcodes") |> 
+  dplyr::left_join(geneset_scores_df, by = "barcodes") |> 
+  dplyr::mutate(
+    # first remove other library IDs from tumor annotations
+    # we only care about whether or not it was from the original library or a new library
+    singler_tumor_annotation = dplyr::case_when(
+      singler_tumor_annotation == orig_tumor ~ "tumor_original",
+      stringr::str_detect(singler_tumor_annotation, "tumor") ~ "tumor_new",
+      .default = singler_tumor_annotation
+    ),
+    # get the top cell types for plotting later 
+    singler_tumor_annotation_lumped = dplyr::if_else(is.na(singler_tumor_annotation), "Not labeled", singler_tumor_annotation) |>
+          # there are warnings because we have some that have less than 7 cell types (ones that only did tumor/normal)
+          # this is why we ignore warnings in this chunk 
+          forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |>
+          forcats::fct_infreq() |>
+          forcats::fct_relevel("All remaining cell types", after = Inf), 
+    # make a label that combines tumor_new with tumor_original
+    singler_combined_tumor = dplyr::if_else(
+      singler_tumor_annotation_lumped %in% c("tumor_original", "tumor_new"), 
+      "tumor", 
+      singler_tumor_annotation_lumped
+      ) |> 
+      forcats::fct_relevel("tumor", after = 0) |> 
+      forcats::fct_relevel("All remaining cell types", after = Inf),
+    # make a label that is just tumor or normal
+    tumor_normal_classification = dplyr::if_else(singler_combined_tumor == "tumor", "tumor", "normal") |> 
+      forcats::fct_relevel("tumor", after = 0)
+  )
+```
+
+
+```{r}
+# read in marker genes table 
+marker_genes_df <- readr::read_tsv(params$marker_genes_file, show_col_types = FALSE) |> 
+  # account for genes being from multiple sources
+  dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> 
+  dplyr::distinct()
+
+# get list of all cell types found
+cell_types <- unique(marker_genes_df$cell_type)
+
+# get the sum of expression of all genes for each cell type
+gene_exp_df <- cell_types |> 
+  purrr::map(\(type){
+    calculate_sum_markers(marker_genes_df, type)
+  }) |> 
+  purrr::reduce(dplyr::inner_join, by = "barcodes")
+
+# join sum expression columns with classification df 
+classification_df <- classification_df |> 
+  dplyr::left_join(gene_exp_df)
+```
+
+```{r}
+# get columns that correspond to marker genes and gene sets to help with plotting later 
+marker_gene_columns <- colnames(classification_df)[which(endsWith(colnames(classification_df), "_sum"))]
+geneset_columns <- colnames(classification_df)[which(startsWith(colnames(classification_df), "mean-"))]
+```
+
+## UMAP of all cell type annotations 
+
+Here we show the annotated cell types on a UMAP. 
+The top 7 cell types are shown for each reference used and all other cells are lumped together. 
+
+```{r, fig.height=8, fig.width=8}
+ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = singler_tumor_annotation_lumped)) +
+  # set points for all "other" points
+  geom_point(
+    data = dplyr::select(
+      classification_df, -singler_tumor_annotation_lumped
+    ),
+    color = "gray80",
+    alpha = 0.5,
+    size = 0.1
+  ) +
+  # set points for desired cell type
+  geom_point(size = 0.1, alpha = 0.5) +
+  facet_wrap(
+    vars(singler_tumor_annotation_lumped),
+    ncol = 3
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  # remove axis numbers and background grid
+  scale_x_continuous(labels = NULL, breaks = NULL) +
+  scale_y_continuous(labels = NULL, breaks = NULL) +
+  guides(
+    color = guide_legend(
+      title = "Cell type",
+      # more visible points in legend
+      override.aes = list(
+        alpha = 1,
+        size = 1.5
+      )
+    )) +
+  theme(
+        legend.position = c(0.67, 0.33),
+        legend.justification = c("left", "top"),
+        legend.title.align = 0.5,
+        # use slightly smaller legend text, which helps legend fit and prevents
+        #  long wrapped labels from bunching up
+        legend.text = element_text(size = rel(0.85)),
+        legend.key.height = unit(0.75, "cm"),
+        aspect.ratio = 1
+      )
+```
+
+## Evaluate output from `SingleR`
+
+The next three sections will evaluate the annotations by looking at expression of marker genes and gene set scores. 
+
+The gene expression for all marker genes for tumor cells, endothelial cells, mesenchymal-like cells, and immune cells was summed together for each cell. 
+Marker genes were obtained from [Visser et al.](https://doi.org/10.1158/2767-9764.CRC-23-0027). 
+
+Gene set scores are determined by calculating the mean expression of all genes in a given gene set. 
+Scores are calculated for the following gene sets from `MsigDb`:
+
+  - [`ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION.html)
+  - [`RIGGI_EWING_SARCOMA_PROGENITOR_UP`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/RIGGI_EWING_SARCOMA_PROGENITOR_UP.html?ex=1)
+  - [`SILIGAN_TARGETS_OF_EWS_FLI1_FUSION_DN`](https://www.gsea-msigdb.org/gsea/msigdb/cards/SILIGAN_TARGETS_OF_EWS_FLI1_FUSION_DN)
+
+For both marker gene expression and gene set scores, the following plots will be shown: 
+
+- Heatmap showing gene expression or gene set scores across all cells. 
+Here the rows correspond to the gene list and the columns are the cells. 
+The annotations are shown below the heatmap. 
+- Density plot showing gene expression or gene set scores across all cells. 
+Each row is a cell type and the expression or score is plotted on the x-axis. 
+
+
+### Tumor vs. Normal 
+
+In this section we show just the cells that are considered tumor (including both `tumor_original` and `tumor_new`) and normal, lumping all non-tumor cell types together. 
+
+**Marker gene expression**
+
+```{r}
+full_celltype_heatmap(classification_df, marker_gene_columns, "tumor_normal_classification")
+```
+
+
+```{r}
+plot_density(classification_df,
+             "tumor_sum",
+             "tumor_normal_classification")
+```
+
+**Gene set scores**
+
+```{r}
+full_celltype_heatmap(classification_df, geneset_columns, "tumor_normal_classification")
+```
+
+```{r}
+geneset_columns |> 
+  purrr::map(\(column){
+    plot_density(classification_df,
+                 column,
+                 "tumor_normal_classification")
+  })
+```
+
+
+### Tumor vs. all other cell types 
+
+In this section we show all tumor cells together (including both `tumor_original` and `tumor_new`) and the top 5 most represented normal cell types.
+
+**Marker gene expression** 
+
+```{r}
+full_celltype_heatmap(classification_df, marker_gene_columns, "singler_combined_tumor")
+```
+
+```{r}
+marker_gene_columns |> 
+  purrr::map(\(column){
+    plot_density(classification_df,
+                 column,
+                 "singler_combined_tumor")
+  })
+```
+
+
+**Gene set scores**
+
+```{r}
+full_celltype_heatmap(classification_df, geneset_columns, "singler_combined_tumor")
+```
+
+```{r}
+geneset_columns |> 
+  purrr::map(\(column){
+    plot_density(classification_df,
+                 column,
+                 "singler_combined_tumor")
+  })
+```
+
+### Compare newly annotated tumor cells to existing tumor cells
+
+In this section we show both tumor cell types, `tumor_original` and `tumor_new`, separately and the top 5 most represented normal cell types.
+
+**Marker gene expression**
+
+```{r}
+full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_annotation_lumped")
+```
+
+
+```{r}
+marker_gene_columns |> 
+  purrr::map(\(column){
+    plot_density(classification_df,
+                 column,
+                 "singler_tumor_annotation_lumped")
+  })
+```
+
+
+**Gene set scores**
+
+```{r}
+full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_annotation_lumped")
+```
+
+```{r}
+geneset_columns |> 
+  purrr::map(\(column){
+    plot_density(classification_df,
+                 column,
+                 "singler_tumor_annotation_lumped")
+  })
+```
+
+
+## Session Info
+
+```{r session info}
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+```
+
+

From a661afaa231e685fa2a2044f95f233a4b0841144 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 19 Jul 2024 14:49:22 -0500
Subject: [PATCH 02/10] add plotting functions

---
 .../scripts/utils/tumor-validation-helpers.R  | 121 ++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
index 5be97e942..b028fb7aa 100644
--- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
+++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
@@ -117,6 +117,44 @@ create_marker_gene_df <- function(
   return(plot_markers_df)
 }
 
+# calculate the sum of expression for all markers in a given cell type 
+# takes as input the marker gene df with `cell_type` and `ensembl_gene_id` as columns
+# For any genes that are in the specified `cell_type`, sum of the logcounts is calculated
+# output is a data frame with barcodes and `{cell_type}_sum`
+calculate_sum_markers <- function(marker_genes_df,
+                                  type){
+  
+  # get list of marker genes to use 
+  marker_genes <- marker_genes_df |> 
+    dplyr::filter(cell_type == type) |> 
+    dplyr::pull(ensembl_gene_id)
+  
+  # get the gene expression counts for all marker genes
+  sum_exp <- logcounts(sce[marker_genes, ]) |>
+    as.matrix() |>
+    t() |>
+    rowSums() 
+  
+  df <- data.frame(
+    barcodes = names(sum_exp), 
+    sum_exp = sum_exp
+  )
+  
+  # get rid of extra " cells" at end of some of the names 
+  type <- stringr::str_remove(type, " cells")
+  
+  colnames(df) <- c(
+    "barcodes",
+    # add ref name to colnames for easier joining
+    glue::glue("{type}_sum")
+  )
+  
+  return(df)
+}
+
+
+
+
 # Heatmaps ---------------------------------------------------------------------
 
 # create a heatmap where rows are marker genes or gene sets and columns are cells 
@@ -199,3 +237,86 @@ plot_cnv_heatmap <- function(
   )
   return(heatmap)
 }
+
+# create heatmap with gene set as rows and cells as columns 
+# this includes adding an annotation column labeling cells 
+# colors are automatically determined using the `Dark2` palette and assigning to cell types listed in the `annotation_column`
+full_celltype_heatmap <- function(classification_df,
+                                  gene_exp_columns,
+                                  annotation_column){
+  
+  # get list of all cell types being plotted and assign colors from Dark2 palette 
+  cell_types <- unique(classification_df[[annotation_column]])
+  num_cell_types <- length(cell_types)
+  colors <- palette.colors(palette = "Dark2") |> 
+    head(n = num_cell_types) |> 
+    purrr::set_names(cell_types)
+  
+  # create annotation for heatmap
+  annotation <- ComplexHeatmap::columnAnnotation(
+    singler = classification_df[[annotation_column]],
+    col = list(
+      singler = colors
+    )
+  )
+  
+  # build matrix for heatmap cells x gene set sum or mean 
+  heatmap_mtx <- classification_df |>
+    dplyr::select(barcodes, gene_exp_columns) |> 
+    tibble::column_to_rownames("barcodes") |> 
+    as.matrix()
+  colnames(heatmap_mtx) <- stringr::str_remove(colnames(heatmap_mtx), "_sum|mean-")
+  
+  # plot heatmap of marker genes 
+  plot_gene_heatmap(t(heatmap_mtx), 
+                    row_title = "",
+                    legend_title = "Marker gene \nexpression",
+                    annotation = annotation)
+  
+}
+
+# Density plots ----------------------------------------------------------------
+
+# This creates a faceted density plot where cell types present in the `annotation_column` 
+# are on the y-axis and the expression in `gene_exp_column` is on the x-axis 
+# this is mostly copied from the cellassign probability density plot in 
+# https://github.com/AlexsLemonade/scpca-nf/blob/f215046b3a9d9ddc50fac1a198a57e1bb813aeae/templates/qc_report/celltypes_supplemental_report.rmd#L759
+
+
+plot_density <- function(classification_df,
+                         gene_exp_column,
+                         annotation_column){
+  
+  # pull out gene set name to create the plot title 
+  geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-")
+  
+  ggplot(classification_df) +
+    aes(x = !!sym(gene_exp_column)) + # marker gene set column 
+    geom_density(
+      fill = "grey65",
+      linewidth = 0.25
+    ) +
+    labs(
+      x = "Gene set expression",
+      y = "Cell type annotation",
+      title = geneset_name
+    ) +
+    scale_alpha_identity() +
+    facet_grid(
+      rows = vars(!!sym(annotation_column)),
+      switch = "y", # make sure labels are readable 
+      scales = "free_y"
+    ) +
+    theme(
+      strip.background = element_blank(),
+      strip.text.y.left = element_text(
+        angle = 0,
+        hjust = 1
+      ),
+      axis.text.y = element_blank(),
+      axis.ticks.y = element_blank(),
+      panel.grid.minor = element_blank(),
+      panel.grid.major.y = element_blank(),
+      panel.spacing = unit(0.02, "in")
+    )
+}

From faf19a7379adaf167c7333e7fbaa1ea14b74279b Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Mon, 22 Jul 2024 16:20:11 -0500
Subject: [PATCH 03/10] move umap function

---
 .../scripts/utils/tumor-validation-helpers.R  | 53 +++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
index b028fb7aa..84dda4b95 100644
--- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
+++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
@@ -264,11 +264,12 @@ full_celltype_heatmap <- function(classification_df,
   heatmap_mtx <- classification_df |>
     dplyr::select(barcodes, gene_exp_columns) |> 
     tibble::column_to_rownames("barcodes") |> 
-    as.matrix()
-  colnames(heatmap_mtx) <- stringr::str_remove(colnames(heatmap_mtx), "_sum|mean-")
+    as.matrix() |> 
+    t()
+  rownames(heatmap_mtx) <- stringr::str_remove(rownames(heatmap_mtx), "_sum|mean-")
   
   # plot heatmap of marker genes 
-  plot_gene_heatmap(t(heatmap_mtx), 
+  plot_gene_heatmap(heatmap_mtx, 
                     row_title = "",
                     legend_title = "Marker gene \nexpression",
                     annotation = annotation)
@@ -320,3 +321,49 @@ plot_density <- function(classification_df,
       panel.spacing = unit(0.02, "in")
     )
 }
+
+
+# UMAPs ------------------------------------------------------------------------
+
+# creates a faceted UMAP where each panel shows the cell type of interest in color and 
+# all other cells in grey 
+# adapted from `faceted_umap()` function in `celltypes_qc.rmd` from `scpca-nf`
+# https://github.com/AlexsLemonade/scpca-nf/blob/main/templates/qc_report/celltypes_qc.rmd#L143
+
+plot_faceted_umap <- function(classification_df,
+                              annotation_column) {
+  
+  ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = {{ annotation_column }})) +
+    # set points for all "other" points
+    geom_point(
+      data = dplyr::select(
+        classification_df, - {{ annotation_column }}
+      ),
+      color = "gray80",
+      alpha = 0.5,
+      size = 0.1
+    ) +
+    # set points for desired cell type
+    geom_point(size = 0.1, alpha = 0.5) +
+    facet_wrap(
+      vars({{ annotation_column }}),
+      ncol = 3
+    ) +
+    scale_color_brewer(palette = "Dark2") +
+    # remove axis numbers and background grid
+    scale_x_continuous(labels = NULL, breaks = NULL) +
+    scale_y_continuous(labels = NULL, breaks = NULL) +
+    guides(
+      color = guide_legend(
+        title = "Cell type",
+        # more visible points in legend
+        override.aes = list(
+          alpha = 1,
+          size = 1.5
+        )
+      )) +
+    theme(
+      aspect.ratio = 1
+    )
+  
+}

From 37ef45c814d13452efc4b0d3d90c6f38a436a656 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Mon, 22 Jul 2024 16:20:37 -0500
Subject: [PATCH 04/10] add in singler vs. aucell comparison

---
 .../singler-workflow/01-singler-results.Rmd   | 159 ++++++++----------
 1 file changed, 70 insertions(+), 89 deletions(-)

diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
index 5e86ea5e9..213cd926f 100644
--- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
+++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
@@ -77,47 +77,43 @@ geneset_scores_df <- readr::read_tsv(params$geneset_scores_file)
 ## Prep for plots
 
 ```{r}
-# original library annotation 
-orig_tumor <- glue::glue("tumor-{params$library_id}")
-
 # generate classification df to use for plots 
 classification_df <- sce |> 
   scuttle::makePerCellDF(use.dimred = "UMAP") |> 
-  # replace UMAP.1 with UMAP1
-  dplyr::rename_with(
-    \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")
-  ) |>
-  # get rid of excess columns
-  dplyr::select(barcodes, UMAP1, UMAP2) |> 
+  # replace UMAP.1 with UMAP1 and get rid of excess columns
+  dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) |>
   # join with previous annotations, singler results, and gene set scores 
   dplyr::left_join(singler_results_df, by = "barcodes") |> 
   dplyr::left_join(geneset_scores_df, by = "barcodes") |> 
   dplyr::mutate(
+    # set factor order for aucell 
+    aucell_annotation = forcats::fct_relevel(aucell_annotation, "tumor", after= 0), 
     # first remove other library IDs from tumor annotations
     # we only care about whether or not it was from the original library or a new library
-    singler_tumor_annotation = dplyr::case_when(
-      singler_tumor_annotation == orig_tumor ~ "tumor_original",
-      stringr::str_detect(singler_tumor_annotation, "tumor") ~ "tumor_new",
-      .default = singler_tumor_annotation
-    ),
+    singler_annotation = dplyr::case_when(
+      stringr::str_detect(singler_annotation, "tumor") ~ "tumor",
+      is.na(singler_annotation) ~ "unknown", # make sure to separate out unknown labels
+      .default = singler_annotation
+    ) |> 
+      forcats::fct_relevel("tumor", after = 0),
     # get the top cell types for plotting later 
-    singler_tumor_annotation_lumped = dplyr::if_else(is.na(singler_tumor_annotation), "Not labeled", singler_tumor_annotation) |>
+    singler_lumped = singler_annotation |>
           # there are warnings because we have some that have less than 7 cell types (ones that only did tumor/normal)
           # this is why we ignore warnings in this chunk 
           forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |>
           forcats::fct_infreq() |>
           forcats::fct_relevel("All remaining cell types", after = Inf), 
-    # make a label that combines tumor_new with tumor_original
-    singler_combined_tumor = dplyr::if_else(
-      singler_tumor_annotation_lumped %in% c("tumor_original", "tumor_new"), 
-      "tumor", 
-      singler_tumor_annotation_lumped
-      ) |> 
-      forcats::fct_relevel("tumor", after = 0) |> 
-      forcats::fct_relevel("All remaining cell types", after = Inf),
     # make a label that is just tumor or normal
-    tumor_normal_classification = dplyr::if_else(singler_combined_tumor == "tumor", "tumor", "normal") |> 
-      forcats::fct_relevel("tumor", after = 0)
+    singler_tumor_normal = dplyr::if_else(!(singler_annotation %in% c("tumor", "unknown")), "normal", singler_annotation) |> 
+      forcats::fct_relevel("tumor", after = 0), 
+    # get the consensus between aucell and singler 
+    consensus = dplyr::case_when(
+      singler_annotation == "tumor" & aucell_annotation == "tumor" ~ "tumor_both",
+      singler_annotation == "tumor" & aucell_annotation != "tumor" ~ "tumor_singler",
+      singler_annotation != "tumor" & aucell_annotation == "tumor" ~ "tumor_aucell",
+      .default = "normal"
+    ) |> 
+      forcats::fct_relevel("normal", after = Inf)
   )
 ```
 
@@ -141,7 +137,7 @@ gene_exp_df <- cell_types |>
 
 # join sum expression columns with classification df 
 classification_df <- classification_df |> 
-  dplyr::left_join(gene_exp_df)
+  dplyr::left_join(gene_exp_df, by = "barcodes")
 ```
 
 ```{r}
@@ -150,56 +146,44 @@ marker_gene_columns <- colnames(classification_df)[which(endsWith(colnames(class
 geneset_columns <- colnames(classification_df)[which(startsWith(colnames(classification_df), "mean-"))]
 ```
 
-## UMAP of all cell type annotations 
+## UMAPs of all cell type annotations 
 
 Here we show the annotated cell types on a UMAP. 
 The top 7 cell types are shown for each reference used and all other cells are lumped together. 
 
 ```{r, fig.height=8, fig.width=8}
-ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = singler_tumor_annotation_lumped)) +
-  # set points for all "other" points
-  geom_point(
-    data = dplyr::select(
-      classification_df, -singler_tumor_annotation_lumped
-    ),
-    color = "gray80",
-    alpha = 0.5,
-    size = 0.1
-  ) +
-  # set points for desired cell type
-  geom_point(size = 0.1, alpha = 0.5) +
-  facet_wrap(
-    vars(singler_tumor_annotation_lumped),
-    ncol = 3
-  ) +
-  scale_color_brewer(palette = "Dark2") +
-  # remove axis numbers and background grid
-  scale_x_continuous(labels = NULL, breaks = NULL) +
-  scale_y_continuous(labels = NULL, breaks = NULL) +
-  guides(
-    color = guide_legend(
-      title = "Cell type",
-      # more visible points in legend
-      override.aes = list(
-        alpha = 1,
-        size = 1.5
-      )
-    )) +
-  theme(
-        legend.position = c(0.67, 0.33),
-        legend.justification = c("left", "top"),
-        legend.title.align = 0.5,
-        # use slightly smaller legend text, which helps legend fit and prevents
-        #  long wrapped labels from bunching up
-        legend.text = element_text(size = rel(0.85)),
-        legend.key.height = unit(0.75, "cm"),
-        aspect.ratio = 1
-      )
+plot_faceted_umap(classification_df, singler_lumped)
 ```
 
-## Evaluate output from `SingleR`
 
-The next three sections will evaluate the annotations by looking at expression of marker genes and gene set scores. 
+## Comparison between `SingleR` and `AUCell`
+
+Below we show the cells that are annotated as tumor by both methods, just `AUCell` and just `SingleR` along with all normal cells. 
+
+```{r}
+plot_faceted_umap(classification_df, consensus)
+```
+
+We then calculate the confusion matrix between tumor and normal cells called in `SingleR` and `AUCell`. 
+The rows here correspond to the `AUCell` annotation and columns are from `SingleR`. 
+
+```{r}
+caret_df <- classification_df |> 
+  dplyr::filter(singler_tumor_normal != "unknown") |> 
+  dplyr::mutate(singler_tumor_normal = droplevels(singler_tumor_normal, "unknown"))
+
+caret::confusionMatrix(
+  table(
+    caret_df$aucell_annotation,
+    caret_df$singler_tumor_normal
+  )
+)
+```
+
+
+## Validate output from `SingleR`
+
+The next three sections will evaluate the annotations output obtained from using `SingleR` by looking at expression of marker genes and gene set scores. 
 
 The gene expression for all marker genes for tumor cells, endothelial cells, mesenchymal-like cells, and immune cells was summed together for each cell. 
 Marker genes were obtained from [Visser et al.](https://doi.org/10.1158/2767-9764.CRC-23-0027). 
@@ -222,25 +206,25 @@ Each row is a cell type and the expression or score is plotted on the x-axis.
 
 ### Tumor vs. Normal 
 
-In this section we show just the cells that are considered tumor (including both `tumor_original` and `tumor_new`) and normal, lumping all non-tumor cell types together. 
+In this section we show just the cells that are considered tumor and normal, lumping all non-tumor cell types together. 
 
 **Marker gene expression**
 
 ```{r}
-full_celltype_heatmap(classification_df, marker_gene_columns, "tumor_normal_classification")
+full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal")
 ```
 
 
 ```{r}
 plot_density(classification_df,
              "tumor_sum",
-             "tumor_normal_classification")
+             "singler_tumor_normal")
 ```
 
 **Gene set scores**
 
 ```{r}
-full_celltype_heatmap(classification_df, geneset_columns, "tumor_normal_classification")
+full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal")
 ```
 
 ```{r}
@@ -248,19 +232,19 @@ geneset_columns |>
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
-                 "tumor_normal_classification")
+                 "singler_tumor_normal")
   })
 ```
 
 
 ### Tumor vs. all other cell types 
 
-In this section we show all tumor cells together (including both `tumor_original` and `tumor_new`) and the top 5 most represented normal cell types.
+In this section we show all tumor cells and the top 5 most represented normal cell types.
 
 **Marker gene expression** 
 
 ```{r}
-full_celltype_heatmap(classification_df, marker_gene_columns, "singler_combined_tumor")
+full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped")
 ```
 
 ```{r}
@@ -268,7 +252,7 @@ marker_gene_columns |>
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
-                 "singler_combined_tumor")
+                 "singler_lumped")
   })
 ```
 
@@ -276,7 +260,7 @@ marker_gene_columns |>
 **Gene set scores**
 
 ```{r}
-full_celltype_heatmap(classification_df, geneset_columns, "singler_combined_tumor")
+full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped")
 ```
 
 ```{r}
@@ -284,35 +268,32 @@ geneset_columns |>
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
-                 "singler_combined_tumor")
+                 "singler_lumped")
   })
 ```
 
-### Compare newly annotated tumor cells to existing tumor cells
+### Tumor cells annotatd by `SingleR` and `AUCell`
 
-In this section we show both tumor cell types, `tumor_original` and `tumor_new`, separately and the top 5 most represented normal cell types.
+Here we compare the marker gene expression and gene set scores for cells annotated as tumor by both methods, just `SingleR`, or just `AUCell` and compare to all normal cells. 
 
 **Marker gene expression**
 
 ```{r}
-full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_annotation_lumped")
+full_celltype_heatmap(classification_df, marker_gene_columns, "consensus")
 ```
 
 
 ```{r}
-marker_gene_columns |> 
-  purrr::map(\(column){
-    plot_density(classification_df,
-                 column,
-                 "singler_tumor_annotation_lumped")
-  })
+plot_density(classification_df,
+             "tumor_sum",
+             "consensus")
 ```
 
 
 **Gene set scores**
 
 ```{r}
-full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_annotation_lumped")
+full_celltype_heatmap(classification_df, geneset_columns, "consensus")
 ```
 
 ```{r}
@@ -320,7 +301,7 @@ geneset_columns |>
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
-                 "singler_tumor_annotation_lumped")
+                 "consensus")
   })
 ```
 

From da444353fa5cbad45d0fa45f16700956cbb1c9d9 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Mon, 22 Jul 2024 16:22:07 -0500
Subject: [PATCH 05/10] some comment updates

---
 .../singler-workflow/01-singler-results.Rmd                   | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
index 213cd926f..cc5bd6909 100644
--- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
+++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
@@ -18,10 +18,8 @@ params:
 This report summarizes the classification of `r params$library_id` using the following approach: 
 
 - Tumor cells were identified by using `AUCell`. 
-These cells are labeled as `tumor_original`. 
-- All other cells were labeled using `SingleR` with a combination of three references: 
+- All cells were labeled using `SingleR` with a combination of three references: 
   - All tumor cells annotated by `AUCell` for all other libraries in `SCPCP000015`. 
-  Any cells that are labeled as tumor using this reference are labeled as `tumor_new`. 
   - `BlueprintEncodeData` from `celldex`
   - `HumanPrimaryCellAtlasData` from `celldex`
 

From cb1615caa5e8fc5e2addf7cd83e2b90ff2e427e7 Mon Sep 17 00:00:00 2001
From: "Stephanie J. Spielman" <stephanie.spielman@gmail.com>
Date: Tue, 23 Jul 2024 12:40:49 -0400
Subject: [PATCH 06/10] update comments to trigger styler

---
 .../scripts/00_format-benchmark-data.R                |  4 ++--
 .../doublet-detection/scripts/01a_run-scdblfinder.R   | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/analyses/doublet-detection/scripts/00_format-benchmark-data.R b/analyses/doublet-detection/scripts/00_format-benchmark-data.R
index 8f3d15825..2563630a1 100755
--- a/analyses/doublet-detection/scripts/00_format-benchmark-data.R
+++ b/analyses/doublet-detection/scripts/00_format-benchmark-data.R
@@ -9,8 +9,8 @@
 # For each dataset, we normalize counts and calculate PCA, but we perform no additional filtering.
 # The exported files are named `<dataset name>-<sce/anndata>.<rds/h5ad>.
 # Each has a variable `ground_truth_doublets` representing the singlet/doublet calls:
-#   - In the SCE file, this is in the colData slot
-#   - In the AnnData file, this is in the obs slot
+#   - In the SCE file, this is in the colData slot.
+#   - In the AnnData file, this is in the obs slot.
 
 # Load libraries
 suppressPackageStartupMessages({
diff --git a/analyses/doublet-detection/scripts/01a_run-scdblfinder.R b/analyses/doublet-detection/scripts/01a_run-scdblfinder.R
index a53de002c..fbe857551 100755
--- a/analyses/doublet-detection/scripts/01a_run-scdblfinder.R
+++ b/analyses/doublet-detection/scripts/01a_run-scdblfinder.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 
 # This script runs scDblFinder on an SCE file and exports a TSV file of results.
-# If the SCE has fewer than 10 droplets, results will not be calculated and the exported TSV will contain NA values.
+# If the SCE has fewer than 10 droplets, results will not be calculated; the exported TSV will contain only NA values.
 
 # Load libraries ------
 suppressPackageStartupMessages({
@@ -28,7 +28,6 @@ run_scdblfinder <- function(sce,
                             random_seed = NULL,
                             columns_to_keep = c("barcodes", "score", "class"),
                             ...) {
-
   # first check multiple samples, which requires a random seed
   if (!is.null(sample_var) && is.null(random_seed)) {
     if (!sample_var %in% colnames(colData(sce))) {
@@ -113,13 +112,13 @@ opts <- parse_args(OptionParser(option_list = option_list))
 
 # Check input arguments and set up files, directories ------
 if (!file.exists(opts$input_sce_file)) {
-    glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.")
+  glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.")
 }
 if (!stringr::str_ends(opts$output_file, ".tsv")) {
   stop("The output TSV file must end in .tsv.")
 }
 
-fs::dir_create( dirname(opts$output_file) ) # create output directory as needed
+fs::dir_create(dirname(opts$output_file)) # create output directory as needed
 set.seed(opts$random_seed)
 
 # Detect doublets and export TSV file with inferences -----
@@ -146,9 +145,7 @@ if (ncells < cell_threshold) {
     na_df$cxds_score <- NA
   }
   readr::write_tsv(na_df, opts$output_file)
-
 } else {
-
   keep_columns <- c(
     "barcodes",
     "score",
@@ -166,5 +163,5 @@ if (ncells < cell_threshold) {
     random_seed = opts$random_seed,
     columns_to_keep = keep_columns
   ) |>
-  readr::write_tsv(opts$output_file)
+    readr::write_tsv(opts$output_file)
 }

From 0c4d42ea4cbe8a252cf525ec7dcc737fbf29235c Mon Sep 17 00:00:00 2001
From: "Stephanie J. Spielman" <stephanie.spielman@gmail.com>
Date: Tue, 23 Jul 2024 12:41:14 -0400
Subject: [PATCH 07/10] update comments to trigger styler

---
 .../benchmark-test-data/generate-benchmark-test-data.R         | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R b/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R
index 1d0946086..8236f8faa 100755
--- a/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R
+++ b/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R
@@ -2,7 +2,8 @@
 
 # This script generates test data files for use when testing the benchmarking script `../run_doublet-detection-benchmark.sh`.
 # Saved in `./data.zip`, the test data files are subsetted version of the datasets used for benchmarking, as described in `../README.md`.
-# This script assumes that ../scratch/benchmark-datasets/raw contains the .rds files from <https://doi.org/10.5281/zenodo.4562782>'s `real_datasets.zip` file and that there is an available `zip` executable on the system.
+# This script assumes that ../scratch/benchmark-datasets/raw contains the .rds files from <https://doi.org/10.5281/zenodo.4562782>'s `real_datasets.zip` file,
+#  and that there is an available `zip` executable on the system.
 
 library(Matrix)
 set.seed(2024)

From 143be65ad769ba2c0b95e4d880f40dd4c2150db8 Mon Sep 17 00:00:00 2001
From: "Stephanie J. Spielman" <stephanie.spielman@gmail.com>
Date: Tue, 23 Jul 2024 12:42:44 -0400
Subject: [PATCH 08/10] spacing to trigger styling

---
 .../03_compare-benchmark-results.Rmd          | 222 +++++++++---------
 .../02_explore-benchmark-results.Rmd          | 124 +++++-----
 2 files changed, 175 insertions(+), 171 deletions(-)

diff --git a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
index cfa6b277a..9b578b15c 100644
--- a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
+++ b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
@@ -28,7 +28,6 @@ This notebook uses a doublet threshold of `>=0.5` for `cxds`, which may not be u
 
 ### Packages
 
-
 ```{r packages}
 suppressPackageStartupMessages({
   library(SingleCellExperiment)
@@ -43,7 +42,7 @@ theme_set(
     theme(
       legend.title = element_text(size = rel(1.1)),
       legend.text = element_text(size = rel(1.1))
-  )
+    )
 )
 
 # define threshold used to call cxds
@@ -74,7 +73,7 @@ plot_pca_calls <- function(df,
     aes(
       x = PC1,
       y = PC2,
-      color = {{color_column}}
+      color = {{ color_column }}
     ) +
     geom_point(
       size = 0.75,
@@ -87,9 +86,10 @@ plot_pca_calls <- function(df,
     plot <- plot +
       scale_color_manual(
         name = "",
-        values = c("doublet" = "black", "singlet" = "gray90")) +
+        values = c("doublet" = "black", "singlet" = "gray90")
+      ) +
       geom_point(
-        data = dplyr::filter(df, {{color_column}} == "doublet"),
+        data = dplyr::filter(df, {{ color_column }} == "doublet"),
         color = "black",
         size = 0.75
       )
@@ -97,9 +97,11 @@ plot_pca_calls <- function(df,
     plot <- plot +
       scale_color_manual(
         name = "Consensus",
-        values = c("ambiguous" = "yellow",
-                   "doublet" = "black",
-                   "singlet" = "gray90")
+        values = c(
+          "ambiguous" = "yellow",
+          "doublet" = "black",
+          "singlet" = "gray90"
+        )
       ) +
       geom_point(
         data = dplyr::filter(df, consensus_class != "singlet"),
@@ -128,20 +130,22 @@ plot_pca_metrics <- function(df, color_column) {
   )
 
   ggplot(df) +
-    aes(x = PC1,
-        y = PC2,
-        color = {{color_column}}) +
-  geom_point(
-    size = 0.75,
-    alpha = 0.6
-  ) +
-  geom_point(
-    data = dplyr::filter(df, {{color_column}} %in% c("fp", "fn")),
-    size = 0.75
-  ) +
-  scale_color_manual(name = "Call type", values = metric_colors) +
-  ggtitle("Consensus call metrics") +
-  guides(color = guide_legend(override.aes = list(size = 2)))
+    aes(
+      x = PC1,
+      y = PC2,
+      color = {{ color_column }}
+    ) +
+    geom_point(
+      size = 0.75,
+      alpha = 0.6
+    ) +
+    geom_point(
+      data = dplyr::filter(df, {{ color_column }} %in% c("fp", "fn")),
+      size = 0.75
+    ) +
+    scale_color_manual(name = "Call type", values = metric_colors) +
+    ggtitle("Consensus call metrics") +
+    guides(color = guide_legend(override.aes = list(size = 2)))
 }
 ```
 
@@ -161,7 +165,6 @@ dataset_names <- stringr::str_split_1(params$datasets, pattern = " ")
 doublet_df_list <- dataset_names |>
   purrr::map(
     \(dataset) {
-
       scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv"))
       scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv"))
       sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}.rds"))
@@ -172,7 +175,7 @@ doublet_df_list <- dataset_names |>
           barcodes,
           cxds_score,
           scdbl_score = score,
-          scdbl_prediction  = class
+          scdbl_prediction = class
         ) |>
         # add cxds calls at `cxds_threshold` threshold
         dplyr::mutate(
@@ -191,9 +194,10 @@ doublet_df_list <- dataset_names |>
       scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
         tibble::rownames_to_column(var = "barcodes") |>
         dplyr::select(barcodes,
-                      ground_truth = ground_truth_doublets,
-                      PC1 = PCA.1,
-                      PC2 = PCA.2) |>
+          ground_truth = ground_truth_doublets,
+          PC1 = PCA.1,
+          PC2 = PCA.2
+        ) |>
         dplyr::left_join(
           scrub_df,
           by = "barcodes"
@@ -217,7 +221,6 @@ This section contains upset plots that show overlap across doublet calls from ea
 doublet_df_list |>
   purrr::iwalk(
     \(df, dataset) {
-
       doublet_barcodes <- list(
         "scDblFinder" = df$barcodes[df$scdbl_prediction == "doublet"],
         "scrublet"    = df$barcodes[df$scrublet_prediction == "doublet"],
@@ -229,12 +232,10 @@ doublet_df_list |>
         dataset,
         x = 0.65,
         y = 0.95,
-        gp = grid::gpar(fontsize=16)
+        gp = grid::gpar(fontsize = 16)
       )
-
     }
   )
-
 ```
 
 
@@ -251,10 +252,8 @@ To this end, we'll create some new columns for each dataset:
 doublet_df_list <- doublet_df_list |>
   purrr::map(
     \(dataset_df) {
-
       dataset_df |>
         dplyr::rowwise() |>
-
         dplyr::mutate(
           # Add column `consensus_call`
           # "doublet" if all three methods are doublet, and "singlet" otherwise
@@ -280,7 +279,7 @@ doublet_df_list <- doublet_df_list |>
             consensus_call == "singlet" && ground_truth == "doublet" ~ "fn"
           )
         )
-      }
+    }
   )
 ```
 
@@ -306,15 +305,16 @@ This section plots the PCA for each dataset, clockwise from the top left:
   - False negative: red
 
 ```{r, fig.width = 12, fig.height = 9}
-pc_color_columns <- c("scDblFinder prediction" = "scdbl_prediction",
-                      "scrublet prediction" = "scrublet_prediction",
-                      "cxds prediction" = "cxds_prediction",
-                      "Ground truth" = "ground_truth",
-                      "Consensus call" = "consensus_call")
+pc_color_columns <- c(
+  "scDblFinder prediction" = "scdbl_prediction",
+  "scrublet prediction" = "scrublet_prediction",
+  "cxds prediction" = "cxds_prediction",
+  "Ground truth" = "ground_truth",
+  "Consensus call" = "consensus_call"
+)
 doublet_df_list |>
   purrr::imap(
     \(df, dataset) {
-
       pc_plot_list <- pc_color_columns |>
         purrr::imap(
           \(color_column, plot_title) {
@@ -323,7 +323,8 @@ doublet_df_list |>
               color_column = !!sym(color_column),
               plot_type = "calls",
               title = plot_title
-            ) + theme(legend.position = "none")}
+            ) + theme(legend.position = "none")
+          }
         )
 
       # plot for consensus_class, which needs three colors
@@ -343,8 +344,9 @@ doublet_df_list |>
 
       patchwork::wrap_plots(pc_plot_list) +
         plot_annotation(
-            glue::glue("PCA for {dataset}"),
-            theme = theme(plot.title = element_text(size = 16))) +
+          glue::glue("PCA for {dataset}"),
+          theme = theme(plot.title = element_text(size = 16))
+        ) +
         plot_layout(guides = "collect")
     }
   )
@@ -359,35 +361,35 @@ This section shows a table of the consensus class counts and calculates a confus
 metric_df <- doublet_df_list |>
   purrr::imap(
     \(df, dataset) {
-        print(glue::glue("======================== {dataset} ========================"))
-
-        cat("Table of consensus class counts:")
-        print(table(df$consensus_class))
-
-        cat("\n\n")
-
-        # run confusion matrix only if there are both singlets and doublets
-        if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) {
-            confusion_result <- caret::confusionMatrix(
-            # truth should be first
-            table(
-              "Truth" = df$ground_truth,
-              "Consensus prediction" = df$consensus_call
-            ),
-            positive = "doublet"
-          )
-  
-          print(confusion_result)
-  
-          # Extract information we want to present later in a table
-          return(
-            tibble::tibble(
-              "Dataset name" = dataset,
-              "Kappa" = round(confusion_result$overall["Kappa"], 3),
-              "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
-            ) 
+      print(glue::glue("======================== {dataset} ========================"))
+
+      cat("Table of consensus class counts:")
+      print(table(df$consensus_class))
+
+      cat("\n\n")
+
+      # run confusion matrix only if there are both singlets and doublets
+      if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) {
+        confusion_result <- caret::confusionMatrix(
+          # truth should be first
+          table(
+            "Truth" = df$ground_truth,
+            "Consensus prediction" = df$consensus_call
+          ),
+          positive = "doublet"
+        )
+
+        print(confusion_result)
+
+        # Extract information we want to present later in a table
+        return(
+          tibble::tibble(
+            "Dataset name" = dataset,
+            "Kappa" = round(confusion_result$overall["Kappa"], 3),
+            "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
           )
-        }
+        )
+      }
     }
   ) |>
   dplyr::bind_rows()
@@ -426,7 +428,6 @@ Since there are only two methods here, we will create two columns for this data:
 doublet_df_list <- doublet_df_list |>
   purrr::map(
     \(dataset_df) {
-
       dataset_df <- dataset_df |>
         dplyr::rowwise() |>
         # Update column `consensus_call`
@@ -444,7 +445,7 @@ doublet_df_list <- doublet_df_list |>
             all(c(scdbl_prediction, cxds_prediction) == "singlet") ~ "singlet",
             .default = "ambiguous"
           )
-         ) |>
+        ) |>
         # Add Update `confusion_call`
         dplyr::mutate(
           confusion_call = dplyr::case_when(
@@ -475,15 +476,16 @@ This section plots the PCA for each dataset, clockwise from the top left:
 
 
 ```{r, fig.width = 12, fig.height = 6}
-pc_color_columns <- c("scDblFinder prediction" = "scdbl_prediction",
-                      "cxds prediction" = "cxds_prediction",
-                      "Ground truth" = "ground_truth",
-                      "Consensus call" = "consensus_call")
+pc_color_columns <- c(
+  "scDblFinder prediction" = "scdbl_prediction",
+  "cxds prediction" = "cxds_prediction",
+  "Ground truth" = "ground_truth",
+  "Consensus call" = "consensus_call"
+)
 
 doublet_df_list |>
   purrr::imap(
     \(df, dataset) {
-
       pc_plot_list <- pc_color_columns |>
         purrr::imap(
           \(color_column, plot_title) {
@@ -492,7 +494,8 @@ doublet_df_list |>
               color_column = !!sym(color_column),
               plot_type = "calls",
               title = plot_title
-            ) + theme(legend.position = "none")}
+            ) + theme(legend.position = "none")
+          }
         )
       # plot for consensus_class, which needs three colors
       # note that the list name isn't actually used here
@@ -511,8 +514,9 @@ doublet_df_list |>
 
       patchwork::wrap_plots(pc_plot_list) +
         plot_annotation(
-            glue::glue("PCA for {dataset}"),
-            theme = theme(plot.title = element_text(size = 16))) +
+          glue::glue("PCA for {dataset}"),
+          theme = theme(plot.title = element_text(size = 16))
+        ) +
         plot_layout(guides = "collect")
     }
   )
@@ -527,35 +531,35 @@ This section shows a table of the consensus class counts and calculates a confus
 metric_df <- doublet_df_list |>
   purrr::imap(
     \(df, dataset) {
-        print(glue::glue("======================== {dataset} ========================"))
-
-        cat("Table of consensus class counts:")
-        print(table(df$consensus_class))
-
-        cat("\n\n")
-
-        # run confusion matrix only if there are both singlets and doublets
-        if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) {
-          confusion_result <- caret::confusionMatrix(
-            # truth should be first
-            table(
-              "Truth" = df$ground_truth,
-              "Consensus prediction" = df$consensus_call
-            ),
-            positive = "doublet"
-          )
-  
-          print(confusion_result)
-  
-          # Extract information we want to present later in a table
-          return(
-            tibble::tibble(
-              "Dataset name" = dataset,
-              "Kappa" = round(confusion_result$overall["Kappa"], 3),
-              "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
-            )
+      print(glue::glue("======================== {dataset} ========================"))
+
+      cat("Table of consensus class counts:")
+      print(table(df$consensus_class))
+
+      cat("\n\n")
+
+      # run confusion matrix only if there are both singlets and doublets
+      if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) {
+        confusion_result <- caret::confusionMatrix(
+          # truth should be first
+          table(
+            "Truth" = df$ground_truth,
+            "Consensus prediction" = df$consensus_call
+          ),
+          positive = "doublet"
+        )
+
+        print(confusion_result)
+
+        # Extract information we want to present later in a table
+        return(
+          tibble::tibble(
+            "Dataset name" = dataset,
+            "Kappa" = round(confusion_result$overall["Kappa"], 3),
+            "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
           )
-        }
+        )
+      }
     }
   ) |>
   dplyr::bind_rows()
diff --git a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
index 9cdf52806..44ae121b4 100644
--- a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
+++ b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
@@ -47,8 +47,6 @@ data_desc |>
   print()
 ```
 
-
-
 ## Setup
 
 ### Packages
@@ -97,7 +95,6 @@ sce_file <- file.path(
   data_dir,
   glue::glue("{params$dataset}.rds")
 )
-
 ```
 
 ### Functions
@@ -111,8 +108,8 @@ print_doublet_table <- function(df, pred_col, is_ground_truth = FALSE) {
   # df is expected to contain `pred_col`, should should be provided as a string
 
   # If there are no predicted doublets, make sure the tab has a 0 count
-  dub_tab <- table( factor(df[[pred_col]], levels = c("singlet", "doublet") ))
-  dub_percent <- round(dub_tab[["doublet"]]/sum(dub_tab), 4) * 100
+  dub_tab <- table(factor(df[[pred_col]], levels = c("singlet", "doublet")))
+  dub_percent <- round(dub_tab[["doublet"]] / sum(dub_tab), 4) * 100
 
   if (is_ground_truth) {
     output <- glue::glue("{dub_percent}% of droplets in this dataset are doublets.")
@@ -138,31 +135,33 @@ plot_pca <- function(df,
   # By default, this function assumes a discrete color variable (`color_type = "d"`) with two levels. Use `color_type = "c"` for continuous.
 
   p <- ggplot(df) +
-    aes(x = PC1,
-        y = PC2,
-        color = {{color_column}}) +
-  geom_point(alpha = 0.5) +
-  labs(color = color_lab) +
-  theme(
-    legend.title.position = "top",
-    legend.position = "bottom"
-  )
+    aes(
+      x = PC1,
+      y = PC2,
+      color = {{ color_column }}
+    ) +
+    geom_point(alpha = 0.5) +
+    labs(color = color_lab) +
+    theme(
+      legend.title.position = "top",
+      legend.position = "bottom"
+    )
 
   # Set the palette, and ensure doublets are on top for visibility
   if (color_type == "d") {
     p <- p +
       scale_color_manual(values = c("black", "lightblue")) +
-    geom_point(
-      data = dplyr::filter(df, {{color_column}} == "doublet"),
-      color = "black"
-    )
+      geom_point(
+        data = dplyr::filter(df, {{ color_column }} == "doublet"),
+        color = "black"
+      )
   } else if (color_type == "c") {
     p <- p +
       scale_color_viridis_c(direction = -1) +
-    geom_point(
-      data = dplyr::filter(df, {{pred_column}} == "doublet"),
-      aes(color = {{color_column}})
-    )
+      geom_point(
+        data = dplyr::filter(df, {{ pred_column }} == "doublet"),
+        aes(color = {{ color_column }})
+      )
   }
 
   return(p)
@@ -170,14 +169,16 @@ plot_pca <- function(df,
 
 
 
-plot_jitter_scores <- function(df, score_col, pred_col, method_name){
+plot_jitter_scores <- function(df, score_col, pred_col, method_name) {
   # Plot jitter plot of score distributions and predictions, colored by ground truth
   # df is expected to contain columns `ground_truth`, `score_col`, and `pred_col`
   # The column arguments should _not_ be provided as strings
   ggplot(df) +
-    aes(x = ground_truth,
-        y = {{score_col}},
-        color = {{pred_col}}) +
+    aes(
+      x = ground_truth,
+      y = {{ score_col }},
+      color = {{ pred_col }}
+    ) +
     geom_jitter(
       alpha = 0.2,
       width = 0.1
@@ -187,29 +188,31 @@ plot_jitter_scores <- function(df, score_col, pred_col, method_name){
       y = glue::glue("{method_name} score"),
       color = glue::glue("{method_name} prediction")
     ) +
-  theme(
-    legend.title.position = "top",
-    legend.position = "bottom"
-  )
+    theme(
+      legend.title.position = "top",
+      legend.position = "bottom"
+    )
 }
 
 
-plot_density_scores <- function(df, score_col, method_name){
+plot_density_scores <- function(df, score_col, method_name) {
   # Plot density plot of score distributions, colored by ground truth
   # df is expected to contain columns `ground_truth` and `score_col`
   # The column arguments should _not_ be provided as strings
   ggplot(df) +
-    aes(x = {{score_col}},
-        fill = ground_truth) +
+    aes(
+      x = {{ score_col }},
+      fill = ground_truth
+    ) +
     geom_density(alpha = 0.7) +
     labs(
       x = glue::glue("{method_name} score"),
       fill = "Ground truth call"
     ) +
-  theme(
-    legend.title.position = "top",
-    legend.position = "bottom"
-  )
+    theme(
+      legend.title.position = "top",
+      legend.position = "bottom"
+    )
 }
 
 assess_doublets <- function(df, pred_col) {
@@ -217,26 +220,24 @@ assess_doublets <- function(df, pred_col) {
   # The `pred_col` argument _should_ be provided as a strings
   # This function will return NULL if there are insufficient counts to calculate metrics
 
-    # truth should be first
-    confusion_table <- table(
-      "Truth" = df$ground_truth,
-      "Prediction" = df[[pred_col]]
-    )
+  # truth should be first
+  confusion_table <- table(
+    "Truth" = df$ground_truth,
+    "Prediction" = df[[pred_col]]
+  )
 
-    if (!all(dim(confusion_table) == c(2,2))) {
-      print("Metrics could not be calculated; insufficient counts.")
-      return(NULL)
-    } else {
-      return(
-        caret::confusionMatrix(
-          confusion_table,
-          positive = "doublet"
-        )
+  if (!all(dim(confusion_table) == c(2, 2))) {
+    print("Metrics could not be calculated; insufficient counts.")
+    return(NULL)
+  } else {
+    return(
+      caret::confusionMatrix(
+        confusion_table,
+        positive = "doublet"
       )
-    }
-
+    )
+  }
 }
-
 ```
 
 
@@ -250,7 +251,7 @@ scdbl_df <- read_tsv(scdbl_tsv) |>
     barcodes,
     cxds_score,
     scdbl_score = score,
-    scdbl_prediction  = class
+    scdbl_prediction = class
   ) |>
   # add cxds calls at three thresholds:
   # 0.5, 0.75, 0.9
@@ -279,9 +280,10 @@ sce <- read_rds(sce_file)
 sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
   tibble::rownames_to_column(var = "barcodes") |>
   dplyr::select(barcodes,
-                ground_truth = ground_truth_doublets,
-                PC1 = PCA.1,
-                PC2 = PCA.2)
+    ground_truth = ground_truth_doublets,
+    PC1 = PCA.1,
+    PC2 = PCA.2
+  )
 
 # we can now remove the sce
 rm(sce)
@@ -360,7 +362,6 @@ plot_density_scores(
 #### PCA plots
 
 ```{r, eval = has_scdbl, fig.width = 8}
-
 d_plot <- plot_pca(
   doublet_df,
   color_column = scdbl_prediction,
@@ -486,7 +487,6 @@ plot_density_scores(
 #### PCA plots
 
 ```{r fig.width = 10}
-
 cxds_cols <- names(doublet_df)[grep("^cxds_prediction_", names(doublet_df))]
 
 prediction_pcas <- cxds_cols |>
@@ -499,9 +499,9 @@ prediction_pcas <- cxds_cols |>
         color_column = !!as.symbol(col_name),
         color_lab = glue::glue("cxds prediction at {threshold}")
       ) +
-      labs(title = glue::glue("cxds prediction at {threshold}"))
+        labs(title = glue::glue("cxds prediction at {threshold}"))
     }
-)
+  )
 wrap_plots(prediction_pcas)
 ```
 

From 162d4d5146b59616e4048772de18816fe1f280ce Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Tue, 23 Jul 2024 12:57:19 -0400
Subject: [PATCH 09/10] Apply suggestions from code review

Co-authored-by: Joshua Shapiro <josh.shapiro@ccdatalab.org>
---
 .../exploratory-notebooks/03_compare-benchmark-results.Rmd     | 3 ++-
 .../template-notebooks/02_explore-benchmark-results.Rmd        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
index 9b578b15c..8c7d9b8bd 100644
--- a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
+++ b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd
@@ -193,7 +193,8 @@ doublet_df_list <- dataset_names |>
 
       scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
         tibble::rownames_to_column(var = "barcodes") |>
-        dplyr::select(barcodes,
+        dplyr::select(
+          barcodes,
           ground_truth = ground_truth_doublets,
           PC1 = PCA.1,
           PC2 = PCA.2
diff --git a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
index 44ae121b4..ca450806e 100644
--- a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
+++ b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd
@@ -279,7 +279,8 @@ scrub_df <- read_tsv(scrub_tsv)
 sce <- read_rds(sce_file)
 sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
   tibble::rownames_to_column(var = "barcodes") |>
-  dplyr::select(barcodes,
+  dplyr::select(
+    barcodes,
     ground_truth = ground_truth_doublets,
     PC1 = PCA.1,
     PC2 = PCA.2

From 3a6204096fdccc967bada16ce6f0a92b53dc590c Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 23 Jul 2024 12:37:16 -0500
Subject: [PATCH 10/10] use patchwork

---
 .../scripts/utils/tumor-validation-helpers.R  |  4 +++-
 .../singler-workflow/01-singler-results.Rmd   | 20 +++++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
index 84dda4b95..9abfaabc6 100644
--- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
+++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R
@@ -291,7 +291,7 @@ plot_density <- function(classification_df,
   # pull out gene set name to create the plot title 
   geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-")
   
-  ggplot(classification_df) +
+  plot <- ggplot(classification_df) +
     aes(x = !!sym(gene_exp_column)) + # marker gene set column 
     geom_density(
       fill = "grey65",
@@ -320,6 +320,8 @@ plot_density <- function(classification_df,
       panel.grid.major.y = element_blank(),
       panel.spacing = unit(0.02, "in")
     )
+  
+  return(plot)
 }
 
 
diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
index cc5bd6909..86cb6ed84 100644
--- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
+++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd
@@ -225,13 +225,14 @@ plot_density(classification_df,
 full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal")
 ```
 
-```{r}
+```{r, fig.height=10}
 geneset_columns |> 
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
                  "singler_tumor_normal")
-  })
+  }) |> 
+  patchwork::wrap_plots(ncol = 1)
 ```
 
 
@@ -245,13 +246,14 @@ In this section we show all tumor cells and the top 5 most represented normal ce
 full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped")
 ```
 
-```{r}
+```{r, fig.height=10}
 marker_gene_columns |> 
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
                  "singler_lumped")
-  })
+  }) |> 
+  patchwork::wrap_plots(ncol = 1)
 ```
 
 
@@ -261,13 +263,14 @@ marker_gene_columns |>
 full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped")
 ```
 
-```{r}
+```{r, fig.height=10}
 geneset_columns |> 
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
                  "singler_lumped")
-  })
+  }) |> 
+  patchwork::wrap_plots(ncol = 1)
 ```
 
 ### Tumor cells annotatd by `SingleR` and `AUCell`
@@ -294,13 +297,14 @@ plot_density(classification_df,
 full_celltype_heatmap(classification_df, geneset_columns, "consensus")
 ```
 
-```{r}
+```{r, fig.height=10}
 geneset_columns |> 
   purrr::map(\(column){
     plot_density(classification_df,
                  column,
                  "consensus")
-  })
+  }) |> 
+  patchwork::wrap_plots(ncol = 1)
 ```