From 970a063bd968ea25ff6b051139a134a6fd8802a9 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 19 Jul 2024 14:48:42 -0500 Subject: [PATCH 01/10] template for evaluating singler --- .../singler-workflow/01-singler-results.Rmd | 335 ++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd new file mode 100644 index 000000000..5e86ea5e9 --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd @@ -0,0 +1,335 @@ +--- +title: "Summary of tumor cell classification with `SingleR` for `r params$library_id`" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 + code_folding: "hide" +params: + library_id: NULL + sce_file: NULL + singler_results_file: NULL + geneset_scores_file: NULL + marker_genes_file: NULL +--- + +This report summarizes the classification of `r params$library_id` using the following approach: + +- Tumor cells were identified by using `AUCell`. +These cells are labeled as `tumor_original`. +- All other cells were labeled using `SingleR` with a combination of three references: + - All tumor cells annotated by `AUCell` for all other libraries in `SCPCP000015`. + Any cells that are labeled as tumor using this reference are labeled as `tumor_new`. + - `BlueprintEncodeData` from `celldex` + - `HumanPrimaryCellAtlasData` from `celldex` + + +## Setup + +```{r} +# check that sce and results files exist +stopifnot( + "sce file does not exist" = file.exists(params$sce_file), + "SingleR results file does not exist" = file.exists(params$singler_results_file), + "Gene set scores file does not exist" = file.exists(params$geneset_scores_file), + "Marker genes file does not exist" = file.exists(params$marker_genes_file) +) +``` + + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(SingleCellExperiment) + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_bw() +) + +# quiet messages +options(readr.show_col_types = FALSE) +ComplexHeatmap::ht_opt(message = FALSE) +``` + + +```{r base paths} +# The path to this module +module_base <- rprojroot::find_root(rprojroot::is_renv_project) + +# source in helper functions: plot_gene_heatmap() and plot_cnv_heatmap() +# create_classification_df() and create_marker_gene_df() +validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R") +source(validation_functions) +``` + +```{r} +# Read in files +sce <- readr::read_rds(params$sce_file) +singler_results_df <- readr::read_tsv(params$singler_results_file) +geneset_scores_df <- readr::read_tsv(params$geneset_scores_file) +``` + +## Prep for plots + +```{r} +# original library annotation +orig_tumor <- glue::glue("tumor-{params$library_id}") + +# generate classification df to use for plots +classification_df <- sce |> + scuttle::makePerCellDF(use.dimred = "UMAP") |> + # replace UMAP.1 with UMAP1 + dplyr::rename_with( + \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP") + ) |> + # get rid of excess columns + dplyr::select(barcodes, UMAP1, UMAP2) |> + # join with previous annotations, singler results, and gene set scores + dplyr::left_join(singler_results_df, by = "barcodes") |> + dplyr::left_join(geneset_scores_df, by = "barcodes") |> + dplyr::mutate( + # first remove other library IDs from tumor annotations + # we only care about whether or not it was from the original library or a new library + singler_tumor_annotation = dplyr::case_when( + singler_tumor_annotation == orig_tumor ~ "tumor_original", + stringr::str_detect(singler_tumor_annotation, "tumor") ~ "tumor_new", + .default = singler_tumor_annotation + ), + # get the top cell types for plotting later + singler_tumor_annotation_lumped = dplyr::if_else(is.na(singler_tumor_annotation), "Not labeled", singler_tumor_annotation) |> + # there are warnings because we have some that have less than 7 cell types (ones that only did tumor/normal) + # this is why we ignore warnings in this chunk + forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |> + forcats::fct_infreq() |> + forcats::fct_relevel("All remaining cell types", after = Inf), + # make a label that combines tumor_new with tumor_original + singler_combined_tumor = dplyr::if_else( + singler_tumor_annotation_lumped %in% c("tumor_original", "tumor_new"), + "tumor", + singler_tumor_annotation_lumped + ) |> + forcats::fct_relevel("tumor", after = 0) |> + forcats::fct_relevel("All remaining cell types", after = Inf), + # make a label that is just tumor or normal + tumor_normal_classification = dplyr::if_else(singler_combined_tumor == "tumor", "tumor", "normal") |> + forcats::fct_relevel("tumor", after = 0) + ) +``` + + +```{r} +# read in marker genes table +marker_genes_df <- readr::read_tsv(params$marker_genes_file, show_col_types = FALSE) |> + # account for genes being from multiple sources + dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> + dplyr::distinct() + +# get list of all cell types found +cell_types <- unique(marker_genes_df$cell_type) + +# get the sum of expression of all genes for each cell type +gene_exp_df <- cell_types |> + purrr::map(\(type){ + calculate_sum_markers(marker_genes_df, type) + }) |> + purrr::reduce(dplyr::inner_join, by = "barcodes") + +# join sum expression columns with classification df +classification_df <- classification_df |> + dplyr::left_join(gene_exp_df) +``` + +```{r} +# get columns that correspond to marker genes and gene sets to help with plotting later +marker_gene_columns <- colnames(classification_df)[which(endsWith(colnames(classification_df), "_sum"))] +geneset_columns <- colnames(classification_df)[which(startsWith(colnames(classification_df), "mean-"))] +``` + +## UMAP of all cell type annotations + +Here we show the annotated cell types on a UMAP. +The top 7 cell types are shown for each reference used and all other cells are lumped together. + +```{r, fig.height=8, fig.width=8} +ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = singler_tumor_annotation_lumped)) + + # set points for all "other" points + geom_point( + data = dplyr::select( + classification_df, -singler_tumor_annotation_lumped + ), + color = "gray80", + alpha = 0.5, + size = 0.1 + ) + + # set points for desired cell type + geom_point(size = 0.1, alpha = 0.5) + + facet_wrap( + vars(singler_tumor_annotation_lumped), + ncol = 3 + ) + + scale_color_brewer(palette = "Dark2") + + # remove axis numbers and background grid + scale_x_continuous(labels = NULL, breaks = NULL) + + scale_y_continuous(labels = NULL, breaks = NULL) + + guides( + color = guide_legend( + title = "Cell type", + # more visible points in legend + override.aes = list( + alpha = 1, + size = 1.5 + ) + )) + + theme( + legend.position = c(0.67, 0.33), + legend.justification = c("left", "top"), + legend.title.align = 0.5, + # use slightly smaller legend text, which helps legend fit and prevents + # long wrapped labels from bunching up + legend.text = element_text(size = rel(0.85)), + legend.key.height = unit(0.75, "cm"), + aspect.ratio = 1 + ) +``` + +## Evaluate output from `SingleR` + +The next three sections will evaluate the annotations by looking at expression of marker genes and gene set scores. + +The gene expression for all marker genes for tumor cells, endothelial cells, mesenchymal-like cells, and immune cells was summed together for each cell. +Marker genes were obtained from [Visser et al.](https://doi.org/10.1158/2767-9764.CRC-23-0027). + +Gene set scores are determined by calculating the mean expression of all genes in a given gene set. +Scores are calculated for the following gene sets from `MsigDb`: + + - [`ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION.html) + - [`RIGGI_EWING_SARCOMA_PROGENITOR_UP`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/RIGGI_EWING_SARCOMA_PROGENITOR_UP.html?ex=1) + - [`SILIGAN_TARGETS_OF_EWS_FLI1_FUSION_DN`](https://www.gsea-msigdb.org/gsea/msigdb/cards/SILIGAN_TARGETS_OF_EWS_FLI1_FUSION_DN) + +For both marker gene expression and gene set scores, the following plots will be shown: + +- Heatmap showing gene expression or gene set scores across all cells. +Here the rows correspond to the gene list and the columns are the cells. +The annotations are shown below the heatmap. +- Density plot showing gene expression or gene set scores across all cells. +Each row is a cell type and the expression or score is plotted on the x-axis. + + +### Tumor vs. Normal + +In this section we show just the cells that are considered tumor (including both `tumor_original` and `tumor_new`) and normal, lumping all non-tumor cell types together. + +**Marker gene expression** + +```{r} +full_celltype_heatmap(classification_df, marker_gene_columns, "tumor_normal_classification") +``` + + +```{r} +plot_density(classification_df, + "tumor_sum", + "tumor_normal_classification") +``` + +**Gene set scores** + +```{r} +full_celltype_heatmap(classification_df, geneset_columns, "tumor_normal_classification") +``` + +```{r} +geneset_columns |> + purrr::map(\(column){ + plot_density(classification_df, + column, + "tumor_normal_classification") + }) +``` + + +### Tumor vs. all other cell types + +In this section we show all tumor cells together (including both `tumor_original` and `tumor_new`) and the top 5 most represented normal cell types. + +**Marker gene expression** + +```{r} +full_celltype_heatmap(classification_df, marker_gene_columns, "singler_combined_tumor") +``` + +```{r} +marker_gene_columns |> + purrr::map(\(column){ + plot_density(classification_df, + column, + "singler_combined_tumor") + }) +``` + + +**Gene set scores** + +```{r} +full_celltype_heatmap(classification_df, geneset_columns, "singler_combined_tumor") +``` + +```{r} +geneset_columns |> + purrr::map(\(column){ + plot_density(classification_df, + column, + "singler_combined_tumor") + }) +``` + +### Compare newly annotated tumor cells to existing tumor cells + +In this section we show both tumor cell types, `tumor_original` and `tumor_new`, separately and the top 5 most represented normal cell types. + +**Marker gene expression** + +```{r} +full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_annotation_lumped") +``` + + +```{r} +marker_gene_columns |> + purrr::map(\(column){ + plot_density(classification_df, + column, + "singler_tumor_annotation_lumped") + }) +``` + + +**Gene set scores** + +```{r} +full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_annotation_lumped") +``` + +```{r} +geneset_columns |> + purrr::map(\(column){ + plot_density(classification_df, + column, + "singler_tumor_annotation_lumped") + }) +``` + + +## Session Info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + + From a661afaa231e685fa2a2044f95f233a4b0841144 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 19 Jul 2024 14:49:22 -0500 Subject: [PATCH 02/10] add plotting functions --- .../scripts/utils/tumor-validation-helpers.R | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R index 5be97e942..b028fb7aa 100644 --- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R +++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R @@ -117,6 +117,44 @@ create_marker_gene_df <- function( return(plot_markers_df) } +# calculate the sum of expression for all markers in a given cell type +# takes as input the marker gene df with `cell_type` and `ensembl_gene_id` as columns +# For any genes that are in the specified `cell_type`, sum of the logcounts is calculated +# output is a data frame with barcodes and `{cell_type}_sum` +calculate_sum_markers <- function(marker_genes_df, + type){ + + # get list of marker genes to use + marker_genes <- marker_genes_df |> + dplyr::filter(cell_type == type) |> + dplyr::pull(ensembl_gene_id) + + # get the gene expression counts for all marker genes + sum_exp <- logcounts(sce[marker_genes, ]) |> + as.matrix() |> + t() |> + rowSums() + + df <- data.frame( + barcodes = names(sum_exp), + sum_exp = sum_exp + ) + + # get rid of extra " cells" at end of some of the names + type <- stringr::str_remove(type, " cells") + + colnames(df) <- c( + "barcodes", + # add ref name to colnames for easier joining + glue::glue("{type}_sum") + ) + + return(df) +} + + + + # Heatmaps --------------------------------------------------------------------- # create a heatmap where rows are marker genes or gene sets and columns are cells @@ -199,3 +237,86 @@ plot_cnv_heatmap <- function( ) return(heatmap) } + +# create heatmap with gene set as rows and cells as columns +# this includes adding an annotation column labeling cells +# colors are automatically determined using the `Dark2` palette and assigning to cell types listed in the `annotation_column` +full_celltype_heatmap <- function(classification_df, + gene_exp_columns, + annotation_column){ + + # get list of all cell types being plotted and assign colors from Dark2 palette + cell_types <- unique(classification_df[[annotation_column]]) + num_cell_types <- length(cell_types) + colors <- palette.colors(palette = "Dark2") |> + head(n = num_cell_types) |> + purrr::set_names(cell_types) + + # create annotation for heatmap + annotation <- ComplexHeatmap::columnAnnotation( + singler = classification_df[[annotation_column]], + col = list( + singler = colors + ) + ) + + # build matrix for heatmap cells x gene set sum or mean + heatmap_mtx <- classification_df |> + dplyr::select(barcodes, gene_exp_columns) |> + tibble::column_to_rownames("barcodes") |> + as.matrix() + colnames(heatmap_mtx) <- stringr::str_remove(colnames(heatmap_mtx), "_sum|mean-") + + # plot heatmap of marker genes + plot_gene_heatmap(t(heatmap_mtx), + row_title = "", + legend_title = "Marker gene \nexpression", + annotation = annotation) + +} + +# Density plots ---------------------------------------------------------------- + +# This creates a faceted density plot where cell types present in the `annotation_column` +# are on the y-axis and the expression in `gene_exp_column` is on the x-axis +# this is mostly copied from the cellassign probability density plot in +# https://github.com/AlexsLemonade/scpca-nf/blob/f215046b3a9d9ddc50fac1a198a57e1bb813aeae/templates/qc_report/celltypes_supplemental_report.rmd#L759 + + +plot_density <- function(classification_df, + gene_exp_column, + annotation_column){ + + # pull out gene set name to create the plot title + geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-") + + ggplot(classification_df) + + aes(x = !!sym(gene_exp_column)) + # marker gene set column + geom_density( + fill = "grey65", + linewidth = 0.25 + ) + + labs( + x = "Gene set expression", + y = "Cell type annotation", + title = geneset_name + ) + + scale_alpha_identity() + + facet_grid( + rows = vars(!!sym(annotation_column)), + switch = "y", # make sure labels are readable + scales = "free_y" + ) + + theme( + strip.background = element_blank(), + strip.text.y.left = element_text( + angle = 0, + hjust = 1 + ), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + panel.grid.minor = element_blank(), + panel.grid.major.y = element_blank(), + panel.spacing = unit(0.02, "in") + ) +} From faf19a7379adaf167c7333e7fbaa1ea14b74279b Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Mon, 22 Jul 2024 16:20:11 -0500 Subject: [PATCH 03/10] move umap function --- .../scripts/utils/tumor-validation-helpers.R | 53 +++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R index b028fb7aa..84dda4b95 100644 --- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R +++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R @@ -264,11 +264,12 @@ full_celltype_heatmap <- function(classification_df, heatmap_mtx <- classification_df |> dplyr::select(barcodes, gene_exp_columns) |> tibble::column_to_rownames("barcodes") |> - as.matrix() - colnames(heatmap_mtx) <- stringr::str_remove(colnames(heatmap_mtx), "_sum|mean-") + as.matrix() |> + t() + rownames(heatmap_mtx) <- stringr::str_remove(rownames(heatmap_mtx), "_sum|mean-") # plot heatmap of marker genes - plot_gene_heatmap(t(heatmap_mtx), + plot_gene_heatmap(heatmap_mtx, row_title = "", legend_title = "Marker gene \nexpression", annotation = annotation) @@ -320,3 +321,49 @@ plot_density <- function(classification_df, panel.spacing = unit(0.02, "in") ) } + + +# UMAPs ------------------------------------------------------------------------ + +# creates a faceted UMAP where each panel shows the cell type of interest in color and +# all other cells in grey +# adapted from `faceted_umap()` function in `celltypes_qc.rmd` from `scpca-nf` +# https://github.com/AlexsLemonade/scpca-nf/blob/main/templates/qc_report/celltypes_qc.rmd#L143 + +plot_faceted_umap <- function(classification_df, + annotation_column) { + + ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = {{ annotation_column }})) + + # set points for all "other" points + geom_point( + data = dplyr::select( + classification_df, - {{ annotation_column }} + ), + color = "gray80", + alpha = 0.5, + size = 0.1 + ) + + # set points for desired cell type + geom_point(size = 0.1, alpha = 0.5) + + facet_wrap( + vars({{ annotation_column }}), + ncol = 3 + ) + + scale_color_brewer(palette = "Dark2") + + # remove axis numbers and background grid + scale_x_continuous(labels = NULL, breaks = NULL) + + scale_y_continuous(labels = NULL, breaks = NULL) + + guides( + color = guide_legend( + title = "Cell type", + # more visible points in legend + override.aes = list( + alpha = 1, + size = 1.5 + ) + )) + + theme( + aspect.ratio = 1 + ) + +} From 37ef45c814d13452efc4b0d3d90c6f38a436a656 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Mon, 22 Jul 2024 16:20:37 -0500 Subject: [PATCH 04/10] add in singler vs. aucell comparison --- .../singler-workflow/01-singler-results.Rmd | 159 ++++++++---------- 1 file changed, 70 insertions(+), 89 deletions(-) diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd index 5e86ea5e9..213cd926f 100644 --- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd @@ -77,47 +77,43 @@ geneset_scores_df <- readr::read_tsv(params$geneset_scores_file) ## Prep for plots ```{r} -# original library annotation -orig_tumor <- glue::glue("tumor-{params$library_id}") - # generate classification df to use for plots classification_df <- sce |> scuttle::makePerCellDF(use.dimred = "UMAP") |> - # replace UMAP.1 with UMAP1 - dplyr::rename_with( - \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP") - ) |> - # get rid of excess columns - dplyr::select(barcodes, UMAP1, UMAP2) |> + # replace UMAP.1 with UMAP1 and get rid of excess columns + dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) |> # join with previous annotations, singler results, and gene set scores dplyr::left_join(singler_results_df, by = "barcodes") |> dplyr::left_join(geneset_scores_df, by = "barcodes") |> dplyr::mutate( + # set factor order for aucell + aucell_annotation = forcats::fct_relevel(aucell_annotation, "tumor", after= 0), # first remove other library IDs from tumor annotations # we only care about whether or not it was from the original library or a new library - singler_tumor_annotation = dplyr::case_when( - singler_tumor_annotation == orig_tumor ~ "tumor_original", - stringr::str_detect(singler_tumor_annotation, "tumor") ~ "tumor_new", - .default = singler_tumor_annotation - ), + singler_annotation = dplyr::case_when( + stringr::str_detect(singler_annotation, "tumor") ~ "tumor", + is.na(singler_annotation) ~ "unknown", # make sure to separate out unknown labels + .default = singler_annotation + ) |> + forcats::fct_relevel("tumor", after = 0), # get the top cell types for plotting later - singler_tumor_annotation_lumped = dplyr::if_else(is.na(singler_tumor_annotation), "Not labeled", singler_tumor_annotation) |> + singler_lumped = singler_annotation |> # there are warnings because we have some that have less than 7 cell types (ones that only did tumor/normal) # this is why we ignore warnings in this chunk forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |> forcats::fct_infreq() |> forcats::fct_relevel("All remaining cell types", after = Inf), - # make a label that combines tumor_new with tumor_original - singler_combined_tumor = dplyr::if_else( - singler_tumor_annotation_lumped %in% c("tumor_original", "tumor_new"), - "tumor", - singler_tumor_annotation_lumped - ) |> - forcats::fct_relevel("tumor", after = 0) |> - forcats::fct_relevel("All remaining cell types", after = Inf), # make a label that is just tumor or normal - tumor_normal_classification = dplyr::if_else(singler_combined_tumor == "tumor", "tumor", "normal") |> - forcats::fct_relevel("tumor", after = 0) + singler_tumor_normal = dplyr::if_else(!(singler_annotation %in% c("tumor", "unknown")), "normal", singler_annotation) |> + forcats::fct_relevel("tumor", after = 0), + # get the consensus between aucell and singler + consensus = dplyr::case_when( + singler_annotation == "tumor" & aucell_annotation == "tumor" ~ "tumor_both", + singler_annotation == "tumor" & aucell_annotation != "tumor" ~ "tumor_singler", + singler_annotation != "tumor" & aucell_annotation == "tumor" ~ "tumor_aucell", + .default = "normal" + ) |> + forcats::fct_relevel("normal", after = Inf) ) ``` @@ -141,7 +137,7 @@ gene_exp_df <- cell_types |> # join sum expression columns with classification df classification_df <- classification_df |> - dplyr::left_join(gene_exp_df) + dplyr::left_join(gene_exp_df, by = "barcodes") ``` ```{r} @@ -150,56 +146,44 @@ marker_gene_columns <- colnames(classification_df)[which(endsWith(colnames(class geneset_columns <- colnames(classification_df)[which(startsWith(colnames(classification_df), "mean-"))] ``` -## UMAP of all cell type annotations +## UMAPs of all cell type annotations Here we show the annotated cell types on a UMAP. The top 7 cell types are shown for each reference used and all other cells are lumped together. ```{r, fig.height=8, fig.width=8} -ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = singler_tumor_annotation_lumped)) + - # set points for all "other" points - geom_point( - data = dplyr::select( - classification_df, -singler_tumor_annotation_lumped - ), - color = "gray80", - alpha = 0.5, - size = 0.1 - ) + - # set points for desired cell type - geom_point(size = 0.1, alpha = 0.5) + - facet_wrap( - vars(singler_tumor_annotation_lumped), - ncol = 3 - ) + - scale_color_brewer(palette = "Dark2") + - # remove axis numbers and background grid - scale_x_continuous(labels = NULL, breaks = NULL) + - scale_y_continuous(labels = NULL, breaks = NULL) + - guides( - color = guide_legend( - title = "Cell type", - # more visible points in legend - override.aes = list( - alpha = 1, - size = 1.5 - ) - )) + - theme( - legend.position = c(0.67, 0.33), - legend.justification = c("left", "top"), - legend.title.align = 0.5, - # use slightly smaller legend text, which helps legend fit and prevents - # long wrapped labels from bunching up - legend.text = element_text(size = rel(0.85)), - legend.key.height = unit(0.75, "cm"), - aspect.ratio = 1 - ) +plot_faceted_umap(classification_df, singler_lumped) ``` -## Evaluate output from `SingleR` -The next three sections will evaluate the annotations by looking at expression of marker genes and gene set scores. +## Comparison between `SingleR` and `AUCell` + +Below we show the cells that are annotated as tumor by both methods, just `AUCell` and just `SingleR` along with all normal cells. + +```{r} +plot_faceted_umap(classification_df, consensus) +``` + +We then calculate the confusion matrix between tumor and normal cells called in `SingleR` and `AUCell`. +The rows here correspond to the `AUCell` annotation and columns are from `SingleR`. + +```{r} +caret_df <- classification_df |> + dplyr::filter(singler_tumor_normal != "unknown") |> + dplyr::mutate(singler_tumor_normal = droplevels(singler_tumor_normal, "unknown")) + +caret::confusionMatrix( + table( + caret_df$aucell_annotation, + caret_df$singler_tumor_normal + ) +) +``` + + +## Validate output from `SingleR` + +The next three sections will evaluate the annotations output obtained from using `SingleR` by looking at expression of marker genes and gene set scores. The gene expression for all marker genes for tumor cells, endothelial cells, mesenchymal-like cells, and immune cells was summed together for each cell. Marker genes were obtained from [Visser et al.](https://doi.org/10.1158/2767-9764.CRC-23-0027). @@ -222,25 +206,25 @@ Each row is a cell type and the expression or score is plotted on the x-axis. ### Tumor vs. Normal -In this section we show just the cells that are considered tumor (including both `tumor_original` and `tumor_new`) and normal, lumping all non-tumor cell types together. +In this section we show just the cells that are considered tumor and normal, lumping all non-tumor cell types together. **Marker gene expression** ```{r} -full_celltype_heatmap(classification_df, marker_gene_columns, "tumor_normal_classification") +full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal") ``` ```{r} plot_density(classification_df, "tumor_sum", - "tumor_normal_classification") + "singler_tumor_normal") ``` **Gene set scores** ```{r} -full_celltype_heatmap(classification_df, geneset_columns, "tumor_normal_classification") +full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal") ``` ```{r} @@ -248,19 +232,19 @@ geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, - "tumor_normal_classification") + "singler_tumor_normal") }) ``` ### Tumor vs. all other cell types -In this section we show all tumor cells together (including both `tumor_original` and `tumor_new`) and the top 5 most represented normal cell types. +In this section we show all tumor cells and the top 5 most represented normal cell types. **Marker gene expression** ```{r} -full_celltype_heatmap(classification_df, marker_gene_columns, "singler_combined_tumor") +full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped") ``` ```{r} @@ -268,7 +252,7 @@ marker_gene_columns |> purrr::map(\(column){ plot_density(classification_df, column, - "singler_combined_tumor") + "singler_lumped") }) ``` @@ -276,7 +260,7 @@ marker_gene_columns |> **Gene set scores** ```{r} -full_celltype_heatmap(classification_df, geneset_columns, "singler_combined_tumor") +full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped") ``` ```{r} @@ -284,35 +268,32 @@ geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, - "singler_combined_tumor") + "singler_lumped") }) ``` -### Compare newly annotated tumor cells to existing tumor cells +### Tumor cells annotatd by `SingleR` and `AUCell` -In this section we show both tumor cell types, `tumor_original` and `tumor_new`, separately and the top 5 most represented normal cell types. +Here we compare the marker gene expression and gene set scores for cells annotated as tumor by both methods, just `SingleR`, or just `AUCell` and compare to all normal cells. **Marker gene expression** ```{r} -full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_annotation_lumped") +full_celltype_heatmap(classification_df, marker_gene_columns, "consensus") ``` ```{r} -marker_gene_columns |> - purrr::map(\(column){ - plot_density(classification_df, - column, - "singler_tumor_annotation_lumped") - }) +plot_density(classification_df, + "tumor_sum", + "consensus") ``` **Gene set scores** ```{r} -full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_annotation_lumped") +full_celltype_heatmap(classification_df, geneset_columns, "consensus") ``` ```{r} @@ -320,7 +301,7 @@ geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, - "singler_tumor_annotation_lumped") + "consensus") }) ``` From da444353fa5cbad45d0fa45f16700956cbb1c9d9 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Mon, 22 Jul 2024 16:22:07 -0500 Subject: [PATCH 05/10] some comment updates --- .../singler-workflow/01-singler-results.Rmd | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd index 213cd926f..cc5bd6909 100644 --- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd @@ -18,10 +18,8 @@ params: This report summarizes the classification of `r params$library_id` using the following approach: - Tumor cells were identified by using `AUCell`. -These cells are labeled as `tumor_original`. -- All other cells were labeled using `SingleR` with a combination of three references: +- All cells were labeled using `SingleR` with a combination of three references: - All tumor cells annotated by `AUCell` for all other libraries in `SCPCP000015`. - Any cells that are labeled as tumor using this reference are labeled as `tumor_new`. - `BlueprintEncodeData` from `celldex` - `HumanPrimaryCellAtlasData` from `celldex` From cb1615caa5e8fc5e2addf7cd83e2b90ff2e427e7 Mon Sep 17 00:00:00 2001 From: "Stephanie J. Spielman" Date: Tue, 23 Jul 2024 12:40:49 -0400 Subject: [PATCH 06/10] update comments to trigger styler --- .../scripts/00_format-benchmark-data.R | 4 ++-- .../doublet-detection/scripts/01a_run-scdblfinder.R | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/analyses/doublet-detection/scripts/00_format-benchmark-data.R b/analyses/doublet-detection/scripts/00_format-benchmark-data.R index 8f3d15825..2563630a1 100755 --- a/analyses/doublet-detection/scripts/00_format-benchmark-data.R +++ b/analyses/doublet-detection/scripts/00_format-benchmark-data.R @@ -9,8 +9,8 @@ # For each dataset, we normalize counts and calculate PCA, but we perform no additional filtering. # The exported files are named `-.. # Each has a variable `ground_truth_doublets` representing the singlet/doublet calls: -# - In the SCE file, this is in the colData slot -# - In the AnnData file, this is in the obs slot +# - In the SCE file, this is in the colData slot. +# - In the AnnData file, this is in the obs slot. # Load libraries suppressPackageStartupMessages({ diff --git a/analyses/doublet-detection/scripts/01a_run-scdblfinder.R b/analyses/doublet-detection/scripts/01a_run-scdblfinder.R index a53de002c..fbe857551 100755 --- a/analyses/doublet-detection/scripts/01a_run-scdblfinder.R +++ b/analyses/doublet-detection/scripts/01a_run-scdblfinder.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript # This script runs scDblFinder on an SCE file and exports a TSV file of results. -# If the SCE has fewer than 10 droplets, results will not be calculated and the exported TSV will contain NA values. +# If the SCE has fewer than 10 droplets, results will not be calculated; the exported TSV will contain only NA values. # Load libraries ------ suppressPackageStartupMessages({ @@ -28,7 +28,6 @@ run_scdblfinder <- function(sce, random_seed = NULL, columns_to_keep = c("barcodes", "score", "class"), ...) { - # first check multiple samples, which requires a random seed if (!is.null(sample_var) && is.null(random_seed)) { if (!sample_var %in% colnames(colData(sce))) { @@ -113,13 +112,13 @@ opts <- parse_args(OptionParser(option_list = option_list)) # Check input arguments and set up files, directories ------ if (!file.exists(opts$input_sce_file)) { - glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.") + glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.") } if (!stringr::str_ends(opts$output_file, ".tsv")) { stop("The output TSV file must end in .tsv.") } -fs::dir_create( dirname(opts$output_file) ) # create output directory as needed +fs::dir_create(dirname(opts$output_file)) # create output directory as needed set.seed(opts$random_seed) # Detect doublets and export TSV file with inferences ----- @@ -146,9 +145,7 @@ if (ncells < cell_threshold) { na_df$cxds_score <- NA } readr::write_tsv(na_df, opts$output_file) - } else { - keep_columns <- c( "barcodes", "score", @@ -166,5 +163,5 @@ if (ncells < cell_threshold) { random_seed = opts$random_seed, columns_to_keep = keep_columns ) |> - readr::write_tsv(opts$output_file) + readr::write_tsv(opts$output_file) } From 0c4d42ea4cbe8a252cf525ec7dcc737fbf29235c Mon Sep 17 00:00:00 2001 From: "Stephanie J. Spielman" Date: Tue, 23 Jul 2024 12:41:14 -0400 Subject: [PATCH 07/10] update comments to trigger styler --- .../benchmark-test-data/generate-benchmark-test-data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R b/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R index 1d0946086..8236f8faa 100755 --- a/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R +++ b/analyses/doublet-detection/benchmark-test-data/generate-benchmark-test-data.R @@ -2,7 +2,8 @@ # This script generates test data files for use when testing the benchmarking script `../run_doublet-detection-benchmark.sh`. # Saved in `./data.zip`, the test data files are subsetted version of the datasets used for benchmarking, as described in `../README.md`. -# This script assumes that ../scratch/benchmark-datasets/raw contains the .rds files from 's `real_datasets.zip` file and that there is an available `zip` executable on the system. +# This script assumes that ../scratch/benchmark-datasets/raw contains the .rds files from 's `real_datasets.zip` file, +# and that there is an available `zip` executable on the system. library(Matrix) set.seed(2024) From 143be65ad769ba2c0b95e4d880f40dd4c2150db8 Mon Sep 17 00:00:00 2001 From: "Stephanie J. Spielman" Date: Tue, 23 Jul 2024 12:42:44 -0400 Subject: [PATCH 08/10] spacing to trigger styling --- .../03_compare-benchmark-results.Rmd | 222 +++++++++--------- .../02_explore-benchmark-results.Rmd | 124 +++++----- 2 files changed, 175 insertions(+), 171 deletions(-) diff --git a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd index cfa6b277a..9b578b15c 100644 --- a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd +++ b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd @@ -28,7 +28,6 @@ This notebook uses a doublet threshold of `>=0.5` for `cxds`, which may not be u ### Packages - ```{r packages} suppressPackageStartupMessages({ library(SingleCellExperiment) @@ -43,7 +42,7 @@ theme_set( theme( legend.title = element_text(size = rel(1.1)), legend.text = element_text(size = rel(1.1)) - ) + ) ) # define threshold used to call cxds @@ -74,7 +73,7 @@ plot_pca_calls <- function(df, aes( x = PC1, y = PC2, - color = {{color_column}} + color = {{ color_column }} ) + geom_point( size = 0.75, @@ -87,9 +86,10 @@ plot_pca_calls <- function(df, plot <- plot + scale_color_manual( name = "", - values = c("doublet" = "black", "singlet" = "gray90")) + + values = c("doublet" = "black", "singlet" = "gray90") + ) + geom_point( - data = dplyr::filter(df, {{color_column}} == "doublet"), + data = dplyr::filter(df, {{ color_column }} == "doublet"), color = "black", size = 0.75 ) @@ -97,9 +97,11 @@ plot_pca_calls <- function(df, plot <- plot + scale_color_manual( name = "Consensus", - values = c("ambiguous" = "yellow", - "doublet" = "black", - "singlet" = "gray90") + values = c( + "ambiguous" = "yellow", + "doublet" = "black", + "singlet" = "gray90" + ) ) + geom_point( data = dplyr::filter(df, consensus_class != "singlet"), @@ -128,20 +130,22 @@ plot_pca_metrics <- function(df, color_column) { ) ggplot(df) + - aes(x = PC1, - y = PC2, - color = {{color_column}}) + - geom_point( - size = 0.75, - alpha = 0.6 - ) + - geom_point( - data = dplyr::filter(df, {{color_column}} %in% c("fp", "fn")), - size = 0.75 - ) + - scale_color_manual(name = "Call type", values = metric_colors) + - ggtitle("Consensus call metrics") + - guides(color = guide_legend(override.aes = list(size = 2))) + aes( + x = PC1, + y = PC2, + color = {{ color_column }} + ) + + geom_point( + size = 0.75, + alpha = 0.6 + ) + + geom_point( + data = dplyr::filter(df, {{ color_column }} %in% c("fp", "fn")), + size = 0.75 + ) + + scale_color_manual(name = "Call type", values = metric_colors) + + ggtitle("Consensus call metrics") + + guides(color = guide_legend(override.aes = list(size = 2))) } ``` @@ -161,7 +165,6 @@ dataset_names <- stringr::str_split_1(params$datasets, pattern = " ") doublet_df_list <- dataset_names |> purrr::map( \(dataset) { - scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv")) scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv")) sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}.rds")) @@ -172,7 +175,7 @@ doublet_df_list <- dataset_names |> barcodes, cxds_score, scdbl_score = score, - scdbl_prediction = class + scdbl_prediction = class ) |> # add cxds calls at `cxds_threshold` threshold dplyr::mutate( @@ -191,9 +194,10 @@ doublet_df_list <- dataset_names |> scuttle::makePerCellDF(sce, use.dimred = "PCA") |> tibble::rownames_to_column(var = "barcodes") |> dplyr::select(barcodes, - ground_truth = ground_truth_doublets, - PC1 = PCA.1, - PC2 = PCA.2) |> + ground_truth = ground_truth_doublets, + PC1 = PCA.1, + PC2 = PCA.2 + ) |> dplyr::left_join( scrub_df, by = "barcodes" @@ -217,7 +221,6 @@ This section contains upset plots that show overlap across doublet calls from ea doublet_df_list |> purrr::iwalk( \(df, dataset) { - doublet_barcodes <- list( "scDblFinder" = df$barcodes[df$scdbl_prediction == "doublet"], "scrublet" = df$barcodes[df$scrublet_prediction == "doublet"], @@ -229,12 +232,10 @@ doublet_df_list |> dataset, x = 0.65, y = 0.95, - gp = grid::gpar(fontsize=16) + gp = grid::gpar(fontsize = 16) ) - } ) - ``` @@ -251,10 +252,8 @@ To this end, we'll create some new columns for each dataset: doublet_df_list <- doublet_df_list |> purrr::map( \(dataset_df) { - dataset_df |> dplyr::rowwise() |> - dplyr::mutate( # Add column `consensus_call` # "doublet" if all three methods are doublet, and "singlet" otherwise @@ -280,7 +279,7 @@ doublet_df_list <- doublet_df_list |> consensus_call == "singlet" && ground_truth == "doublet" ~ "fn" ) ) - } + } ) ``` @@ -306,15 +305,16 @@ This section plots the PCA for each dataset, clockwise from the top left: - False negative: red ```{r, fig.width = 12, fig.height = 9} -pc_color_columns <- c("scDblFinder prediction" = "scdbl_prediction", - "scrublet prediction" = "scrublet_prediction", - "cxds prediction" = "cxds_prediction", - "Ground truth" = "ground_truth", - "Consensus call" = "consensus_call") +pc_color_columns <- c( + "scDblFinder prediction" = "scdbl_prediction", + "scrublet prediction" = "scrublet_prediction", + "cxds prediction" = "cxds_prediction", + "Ground truth" = "ground_truth", + "Consensus call" = "consensus_call" +) doublet_df_list |> purrr::imap( \(df, dataset) { - pc_plot_list <- pc_color_columns |> purrr::imap( \(color_column, plot_title) { @@ -323,7 +323,8 @@ doublet_df_list |> color_column = !!sym(color_column), plot_type = "calls", title = plot_title - ) + theme(legend.position = "none")} + ) + theme(legend.position = "none") + } ) # plot for consensus_class, which needs three colors @@ -343,8 +344,9 @@ doublet_df_list |> patchwork::wrap_plots(pc_plot_list) + plot_annotation( - glue::glue("PCA for {dataset}"), - theme = theme(plot.title = element_text(size = 16))) + + glue::glue("PCA for {dataset}"), + theme = theme(plot.title = element_text(size = 16)) + ) + plot_layout(guides = "collect") } ) @@ -359,35 +361,35 @@ This section shows a table of the consensus class counts and calculates a confus metric_df <- doublet_df_list |> purrr::imap( \(df, dataset) { - print(glue::glue("======================== {dataset} ========================")) - - cat("Table of consensus class counts:") - print(table(df$consensus_class)) - - cat("\n\n") - - # run confusion matrix only if there are both singlets and doublets - if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) { - confusion_result <- caret::confusionMatrix( - # truth should be first - table( - "Truth" = df$ground_truth, - "Consensus prediction" = df$consensus_call - ), - positive = "doublet" - ) - - print(confusion_result) - - # Extract information we want to present later in a table - return( - tibble::tibble( - "Dataset name" = dataset, - "Kappa" = round(confusion_result$overall["Kappa"], 3), - "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3) - ) + print(glue::glue("======================== {dataset} ========================")) + + cat("Table of consensus class counts:") + print(table(df$consensus_class)) + + cat("\n\n") + + # run confusion matrix only if there are both singlets and doublets + if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) { + confusion_result <- caret::confusionMatrix( + # truth should be first + table( + "Truth" = df$ground_truth, + "Consensus prediction" = df$consensus_call + ), + positive = "doublet" + ) + + print(confusion_result) + + # Extract information we want to present later in a table + return( + tibble::tibble( + "Dataset name" = dataset, + "Kappa" = round(confusion_result$overall["Kappa"], 3), + "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3) ) - } + ) + } } ) |> dplyr::bind_rows() @@ -426,7 +428,6 @@ Since there are only two methods here, we will create two columns for this data: doublet_df_list <- doublet_df_list |> purrr::map( \(dataset_df) { - dataset_df <- dataset_df |> dplyr::rowwise() |> # Update column `consensus_call` @@ -444,7 +445,7 @@ doublet_df_list <- doublet_df_list |> all(c(scdbl_prediction, cxds_prediction) == "singlet") ~ "singlet", .default = "ambiguous" ) - ) |> + ) |> # Add Update `confusion_call` dplyr::mutate( confusion_call = dplyr::case_when( @@ -475,15 +476,16 @@ This section plots the PCA for each dataset, clockwise from the top left: ```{r, fig.width = 12, fig.height = 6} -pc_color_columns <- c("scDblFinder prediction" = "scdbl_prediction", - "cxds prediction" = "cxds_prediction", - "Ground truth" = "ground_truth", - "Consensus call" = "consensus_call") +pc_color_columns <- c( + "scDblFinder prediction" = "scdbl_prediction", + "cxds prediction" = "cxds_prediction", + "Ground truth" = "ground_truth", + "Consensus call" = "consensus_call" +) doublet_df_list |> purrr::imap( \(df, dataset) { - pc_plot_list <- pc_color_columns |> purrr::imap( \(color_column, plot_title) { @@ -492,7 +494,8 @@ doublet_df_list |> color_column = !!sym(color_column), plot_type = "calls", title = plot_title - ) + theme(legend.position = "none")} + ) + theme(legend.position = "none") + } ) # plot for consensus_class, which needs three colors # note that the list name isn't actually used here @@ -511,8 +514,9 @@ doublet_df_list |> patchwork::wrap_plots(pc_plot_list) + plot_annotation( - glue::glue("PCA for {dataset}"), - theme = theme(plot.title = element_text(size = 16))) + + glue::glue("PCA for {dataset}"), + theme = theme(plot.title = element_text(size = 16)) + ) + plot_layout(guides = "collect") } ) @@ -527,35 +531,35 @@ This section shows a table of the consensus class counts and calculates a confus metric_df <- doublet_df_list |> purrr::imap( \(df, dataset) { - print(glue::glue("======================== {dataset} ========================")) - - cat("Table of consensus class counts:") - print(table(df$consensus_class)) - - cat("\n\n") - - # run confusion matrix only if there are both singlets and doublets - if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) { - confusion_result <- caret::confusionMatrix( - # truth should be first - table( - "Truth" = df$ground_truth, - "Consensus prediction" = df$consensus_call - ), - positive = "doublet" - ) - - print(confusion_result) - - # Extract information we want to present later in a table - return( - tibble::tibble( - "Dataset name" = dataset, - "Kappa" = round(confusion_result$overall["Kappa"], 3), - "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3) - ) + print(glue::glue("======================== {dataset} ========================")) + + cat("Table of consensus class counts:") + print(table(df$consensus_class)) + + cat("\n\n") + + # run confusion matrix only if there are both singlets and doublets + if ("singlet" %in% df$consensus_call & "doublet" %in% df$consensus_call) { + confusion_result <- caret::confusionMatrix( + # truth should be first + table( + "Truth" = df$ground_truth, + "Consensus prediction" = df$consensus_call + ), + positive = "doublet" + ) + + print(confusion_result) + + # Extract information we want to present later in a table + return( + tibble::tibble( + "Dataset name" = dataset, + "Kappa" = round(confusion_result$overall["Kappa"], 3), + "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3) ) - } + ) + } } ) |> dplyr::bind_rows() diff --git a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd index 9cdf52806..44ae121b4 100644 --- a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd +++ b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd @@ -47,8 +47,6 @@ data_desc |> print() ``` - - ## Setup ### Packages @@ -97,7 +95,6 @@ sce_file <- file.path( data_dir, glue::glue("{params$dataset}.rds") ) - ``` ### Functions @@ -111,8 +108,8 @@ print_doublet_table <- function(df, pred_col, is_ground_truth = FALSE) { # df is expected to contain `pred_col`, should should be provided as a string # If there are no predicted doublets, make sure the tab has a 0 count - dub_tab <- table( factor(df[[pred_col]], levels = c("singlet", "doublet") )) - dub_percent <- round(dub_tab[["doublet"]]/sum(dub_tab), 4) * 100 + dub_tab <- table(factor(df[[pred_col]], levels = c("singlet", "doublet"))) + dub_percent <- round(dub_tab[["doublet"]] / sum(dub_tab), 4) * 100 if (is_ground_truth) { output <- glue::glue("{dub_percent}% of droplets in this dataset are doublets.") @@ -138,31 +135,33 @@ plot_pca <- function(df, # By default, this function assumes a discrete color variable (`color_type = "d"`) with two levels. Use `color_type = "c"` for continuous. p <- ggplot(df) + - aes(x = PC1, - y = PC2, - color = {{color_column}}) + - geom_point(alpha = 0.5) + - labs(color = color_lab) + - theme( - legend.title.position = "top", - legend.position = "bottom" - ) + aes( + x = PC1, + y = PC2, + color = {{ color_column }} + ) + + geom_point(alpha = 0.5) + + labs(color = color_lab) + + theme( + legend.title.position = "top", + legend.position = "bottom" + ) # Set the palette, and ensure doublets are on top for visibility if (color_type == "d") { p <- p + scale_color_manual(values = c("black", "lightblue")) + - geom_point( - data = dplyr::filter(df, {{color_column}} == "doublet"), - color = "black" - ) + geom_point( + data = dplyr::filter(df, {{ color_column }} == "doublet"), + color = "black" + ) } else if (color_type == "c") { p <- p + scale_color_viridis_c(direction = -1) + - geom_point( - data = dplyr::filter(df, {{pred_column}} == "doublet"), - aes(color = {{color_column}}) - ) + geom_point( + data = dplyr::filter(df, {{ pred_column }} == "doublet"), + aes(color = {{ color_column }}) + ) } return(p) @@ -170,14 +169,16 @@ plot_pca <- function(df, -plot_jitter_scores <- function(df, score_col, pred_col, method_name){ +plot_jitter_scores <- function(df, score_col, pred_col, method_name) { # Plot jitter plot of score distributions and predictions, colored by ground truth # df is expected to contain columns `ground_truth`, `score_col`, and `pred_col` # The column arguments should _not_ be provided as strings ggplot(df) + - aes(x = ground_truth, - y = {{score_col}}, - color = {{pred_col}}) + + aes( + x = ground_truth, + y = {{ score_col }}, + color = {{ pred_col }} + ) + geom_jitter( alpha = 0.2, width = 0.1 @@ -187,29 +188,31 @@ plot_jitter_scores <- function(df, score_col, pred_col, method_name){ y = glue::glue("{method_name} score"), color = glue::glue("{method_name} prediction") ) + - theme( - legend.title.position = "top", - legend.position = "bottom" - ) + theme( + legend.title.position = "top", + legend.position = "bottom" + ) } -plot_density_scores <- function(df, score_col, method_name){ +plot_density_scores <- function(df, score_col, method_name) { # Plot density plot of score distributions, colored by ground truth # df is expected to contain columns `ground_truth` and `score_col` # The column arguments should _not_ be provided as strings ggplot(df) + - aes(x = {{score_col}}, - fill = ground_truth) + + aes( + x = {{ score_col }}, + fill = ground_truth + ) + geom_density(alpha = 0.7) + labs( x = glue::glue("{method_name} score"), fill = "Ground truth call" ) + - theme( - legend.title.position = "top", - legend.position = "bottom" - ) + theme( + legend.title.position = "top", + legend.position = "bottom" + ) } assess_doublets <- function(df, pred_col) { @@ -217,26 +220,24 @@ assess_doublets <- function(df, pred_col) { # The `pred_col` argument _should_ be provided as a strings # This function will return NULL if there are insufficient counts to calculate metrics - # truth should be first - confusion_table <- table( - "Truth" = df$ground_truth, - "Prediction" = df[[pred_col]] - ) + # truth should be first + confusion_table <- table( + "Truth" = df$ground_truth, + "Prediction" = df[[pred_col]] + ) - if (!all(dim(confusion_table) == c(2,2))) { - print("Metrics could not be calculated; insufficient counts.") - return(NULL) - } else { - return( - caret::confusionMatrix( - confusion_table, - positive = "doublet" - ) + if (!all(dim(confusion_table) == c(2, 2))) { + print("Metrics could not be calculated; insufficient counts.") + return(NULL) + } else { + return( + caret::confusionMatrix( + confusion_table, + positive = "doublet" ) - } - + ) + } } - ``` @@ -250,7 +251,7 @@ scdbl_df <- read_tsv(scdbl_tsv) |> barcodes, cxds_score, scdbl_score = score, - scdbl_prediction = class + scdbl_prediction = class ) |> # add cxds calls at three thresholds: # 0.5, 0.75, 0.9 @@ -279,9 +280,10 @@ sce <- read_rds(sce_file) sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |> tibble::rownames_to_column(var = "barcodes") |> dplyr::select(barcodes, - ground_truth = ground_truth_doublets, - PC1 = PCA.1, - PC2 = PCA.2) + ground_truth = ground_truth_doublets, + PC1 = PCA.1, + PC2 = PCA.2 + ) # we can now remove the sce rm(sce) @@ -360,7 +362,6 @@ plot_density_scores( #### PCA plots ```{r, eval = has_scdbl, fig.width = 8} - d_plot <- plot_pca( doublet_df, color_column = scdbl_prediction, @@ -486,7 +487,6 @@ plot_density_scores( #### PCA plots ```{r fig.width = 10} - cxds_cols <- names(doublet_df)[grep("^cxds_prediction_", names(doublet_df))] prediction_pcas <- cxds_cols |> @@ -499,9 +499,9 @@ prediction_pcas <- cxds_cols |> color_column = !!as.symbol(col_name), color_lab = glue::glue("cxds prediction at {threshold}") ) + - labs(title = glue::glue("cxds prediction at {threshold}")) + labs(title = glue::glue("cxds prediction at {threshold}")) } -) + ) wrap_plots(prediction_pcas) ``` From 162d4d5146b59616e4048772de18816fe1f280ce Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Tue, 23 Jul 2024 12:57:19 -0400 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Joshua Shapiro --- .../exploratory-notebooks/03_compare-benchmark-results.Rmd | 3 ++- .../template-notebooks/02_explore-benchmark-results.Rmd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd index 9b578b15c..8c7d9b8bd 100644 --- a/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd +++ b/analyses/doublet-detection/exploratory-notebooks/03_compare-benchmark-results.Rmd @@ -193,7 +193,8 @@ doublet_df_list <- dataset_names |> scuttle::makePerCellDF(sce, use.dimred = "PCA") |> tibble::rownames_to_column(var = "barcodes") |> - dplyr::select(barcodes, + dplyr::select( + barcodes, ground_truth = ground_truth_doublets, PC1 = PCA.1, PC2 = PCA.2 diff --git a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd index 44ae121b4..ca450806e 100644 --- a/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd +++ b/analyses/doublet-detection/template-notebooks/02_explore-benchmark-results.Rmd @@ -279,7 +279,8 @@ scrub_df <- read_tsv(scrub_tsv) sce <- read_rds(sce_file) sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |> tibble::rownames_to_column(var = "barcodes") |> - dplyr::select(barcodes, + dplyr::select( + barcodes, ground_truth = ground_truth_doublets, PC1 = PCA.1, PC2 = PCA.2 From 3a6204096fdccc967bada16ce6f0a92b53dc590c Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 23 Jul 2024 12:37:16 -0500 Subject: [PATCH 10/10] use patchwork --- .../scripts/utils/tumor-validation-helpers.R | 4 +++- .../singler-workflow/01-singler-results.Rmd | 20 +++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R index 84dda4b95..9abfaabc6 100644 --- a/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R +++ b/analyses/cell-type-ewings/scripts/utils/tumor-validation-helpers.R @@ -291,7 +291,7 @@ plot_density <- function(classification_df, # pull out gene set name to create the plot title geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-") - ggplot(classification_df) + + plot <- ggplot(classification_df) + aes(x = !!sym(gene_exp_column)) + # marker gene set column geom_density( fill = "grey65", @@ -320,6 +320,8 @@ plot_density <- function(classification_df, panel.grid.major.y = element_blank(), panel.spacing = unit(0.02, "in") ) + + return(plot) } diff --git a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd index cc5bd6909..86cb6ed84 100644 --- a/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/singler-workflow/01-singler-results.Rmd @@ -225,13 +225,14 @@ plot_density(classification_df, full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal") ``` -```{r} +```{r, fig.height=10} geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, "singler_tumor_normal") - }) + }) |> + patchwork::wrap_plots(ncol = 1) ``` @@ -245,13 +246,14 @@ In this section we show all tumor cells and the top 5 most represented normal ce full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped") ``` -```{r} +```{r, fig.height=10} marker_gene_columns |> purrr::map(\(column){ plot_density(classification_df, column, "singler_lumped") - }) + }) |> + patchwork::wrap_plots(ncol = 1) ``` @@ -261,13 +263,14 @@ marker_gene_columns |> full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped") ``` -```{r} +```{r, fig.height=10} geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, "singler_lumped") - }) + }) |> + patchwork::wrap_plots(ncol = 1) ``` ### Tumor cells annotatd by `SingleR` and `AUCell` @@ -294,13 +297,14 @@ plot_density(classification_df, full_celltype_heatmap(classification_df, geneset_columns, "consensus") ``` -```{r} +```{r, fig.height=10} geneset_columns |> purrr::map(\(column){ plot_density(classification_df, column, "consensus") - }) + }) |> + patchwork::wrap_plots(ncol = 1) ```