Merge pull request #307 from stemangiola/add-solve-confounders

add method
stemangiola · Jul 7, 2024 · 9c9edcc · 9c9edcc
2 parents 5f16b45 + cd6a0fd
commit 9c9edcc
Show file tree

Hide file tree

Showing 9 changed files with 327 additions and 35 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: tidybulk
 Title: Brings transcriptomics to the tidyverse 
-Version: 1.17.2
+Version: 1.17.3
 Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
                   role = c("aut", "cre")),
             person("Maria", "Doyle", email = "[email protected]",

diff --git a/NAMESPACE b/NAMESPACE
@@ -53,6 +53,7 @@ export(quantile_normalise_abundance)
 export(reduce_dimensions)
 export(remove_redundancy)
 export(rename)
+export(resolve_complete_confounders_of_non_interest)
 export(right_join)
 export(rotate_dimensions)
 export(rowwise)
@@ -141,9 +142,11 @@ importFrom(purrr,map2_dfc)
 importFrom(purrr,map2_dfr)
 importFrom(purrr,map_chr)
 importFrom(purrr,map_dfr)
+importFrom(purrr,map_int)
 importFrom(purrr,map_lgl)
 importFrom(purrr,reduce)
 importFrom(purrr,when)
+importFrom(rlang,"!!")
 importFrom(rlang,":=")
 importFrom(rlang,dots_list)
 importFrom(rlang,dots_values)
@@ -159,6 +162,7 @@ importFrom(rlang,quo_is_symbol)
 importFrom(rlang,quo_is_symbolic)
 importFrom(rlang,quo_name)
 importFrom(rlang,quo_squash)
+importFrom(rlang,set_names)
 importFrom(scales,extended_breaks)
 importFrom(scales,label_scientific)
 importFrom(scales,log_breaks)

diff --git a/R/functions.R b/R/functions.R
@@ -465,26 +465,28 @@ get_differential_transcript_abundance_bulk <- function(.data,
 			data = df_for_edgeR %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
 		)
 
+	# # Print the design column names in case I want contrasts
+	# message(
+	# 	sprintf(
+	# 		"tidybulk says: The design column names are \"%s\"",
+	# 		design %>% colnames %>% paste(collapse = ", ")
+	# 	)
+	# )
+
 	# Replace `:` with ___ because it creates error with edgeR
 	if(design |> colnames() |> str_detect(":") |> any()) {
 	  message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.")
 	  colnames(design) = design |> colnames() |> str_replace(":", "___") 
 	}
 
-	# Print the design column names in case I want contrasts
-	message(
-		sprintf(
-			"tidybulk says: The design column names are \"%s\"",
-			design %>% colnames %>% paste(collapse = ", ")
-		)
-	)
+
 
 	# Specify the design column tested
 	if(is.null(.contrasts))
 	  message(
 	    sprintf(
 	      "tidybulk says: The design column being tested is %s",
-	      design %>% colnames %>% .[1]
+	      design %>% colnames %>% .[2]
 	    )
 	  )
 
@@ -770,7 +772,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
       object = .formula |> lme4::nobars(),
       data = metadata
     )
-  
+
   if(quo_is_symbolic(.dispersion))
     dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe()
   else
@@ -787,8 +789,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
 
   # Scaling
   sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
-  
-  
+
+
   glmmSeq_object =
     glmmSeq( .formula,
           countdata = counts ,

diff --git a/R/functions_SE.R b/R/functions_SE.R
@@ -408,7 +408,7 @@ get_reduced_dimensions_TSNE_bulk_SE <-
 #' @param of_samples A boolean
 #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
 #' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
-#' @param ... Further parameters passed to the function uwot
+#' @param ... Further parameters passed to the function uwot::tumap
 #'
 #' @return A tibble with additional columns
 #'
@@ -1145,7 +1145,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
       object = .formula |> lme4::nobars(),
       data = metadata
     )
-  
+
   if(quo_is_symbolic(.dispersion))
     dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe()
   else
@@ -1162,8 +1162,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
 
   # Scaling
   sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
-  
-  
+
+
   glmmSeq_object =
     glmmSeq( .formula,
                       countdata = counts ,
@@ -1496,3 +1496,86 @@ univariable_differential_tissue_composition_SE = function(
 
 		unnest(surv_test, keep_empty = TRUE)
 }
+
+
+#' Resolve Complete Confounders of Non-Interest
+#'
+#' This function processes a SummarizedExperiment object to handle confounders
+#' that are not of interest in the analysis. It deals with two factors, adjusting
+#' the data by nesting and summarizing over these factors.
+#'
+#' @importFrom rlang enquo
+#' @importFrom rlang quo_name
+#' @importFrom rlang !!
+#' @importFrom tidyr unnest
+#' @importFrom tidyr nest
+#' @importFrom dplyr mutate
+#' @importFrom dplyr arrange
+#' @importFrom dplyr slice
+#' @importFrom dplyr pull
+#' @importFrom dplyr select
+#' @importFrom purrr map_int
+#' @importFrom dplyr if_else
+#'
+#' @param se A SummarizedExperiment object that contains the data to be processed.
+#' @param .factor_1 A symbol or quosure representing the first factor variable in `se`.
+#' @param .factor_2 A symbol or quosure representing the second factor variable in `se`.
+#' @return A modified SummarizedExperiment object with confounders resolved.
+#' @examples
+#' # Not run:
+#' # se is a SummarizedExperiment object
+#' resolve_complete_confounders_of_non_interest(se, .factor_1 = factor1, .factor_2 = factor2)
+#' @noRd
+resolve_complete_confounders_of_non_interest_pair_SE <- function(se, .factor_1, .factor_2){
+
+  .factor_1 <- enquo(.factor_1)
+  .factor_2 <- enquo(.factor_2)
+
+  cd =
+    colData(se) |>
+    as_tibble() |>
+    rowid_to_column() |>
+    distinct(rowid, !!.factor_1, !!.factor_2) |>
+
+    nest(se_data = -c(!!.factor_1, !!.factor_2)) |>
+    nest(data = -!!.factor_1) |>
+    mutate(n1 = map_int(data, ~ .x |> distinct(!!.factor_2) |> nrow())) |>
+    unnest(data) |>
+
+    # Nest data excluding .factor_2 and count distinct .factor_1 values
+    nest(data = -!!.factor_2) |>
+    mutate(n2 = map_int(data, ~ .x |> distinct(!!.factor_1) |> nrow())) |>
+    unnest(data)
+
+  # Choose a dummy value for .factor_2 based on sorting by n1 + n2
+  dummy_factor_2 <- cd |> arrange(desc(n1 + n2)) |> slice(1) |> pull(!!.factor_2)
+
+  # Messages if I have confounders
+  if(cd |> filter(n1 + n2 < 3) |> nrow() > 0){
+
+    message(sprintf("tidybulk says: IMPORTANT! the columns %s and %s, have been corrected for complete confounders and now are NOT interpretable, and cannot be used in hypothesis testing.", quo_name(.factor_1), quo_name(.factor_2)))
+
+    message(sprintf(
+      "tidybulk says: The value(s) %s in column %s from sample(s) %s, has been changed to %s.",
+      cd |> filter(n1 + n2 < 3) |> pull(!!.factor_2),
+      quo_name(.factor_2),
+      cd |> filter(n1 + n2 < 3) |> pull(se_data) |> map(colnames) |> unlist(),
+      dummy_factor_2
+    ))
+
+    # Replace .factor_2 with dummy_factor_2 where n1 + n2 is less than 3 and unnest
+    cd = cd |>
+      mutate(!!.factor_2 := if_else(n1 + n2 < 3, dummy_factor_2, !!.factor_2))
+  }
+
+  colData(se)[,c(quo_name(.factor_1), quo_name(.factor_2))] =
+    cd |>
+    unnest(se_data) |>
+    arrange(rowid) |>
+    select(!!.factor_1, !!.factor_2) |>
+    DataFrame()
+
+  se
+
+}
+
diff --git a/R/methods.R b/R/methods.R
@@ -1025,7 +1025,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
 #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
 #' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE.
 #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).
-#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE"
+#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap"
 #'
 #' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
 #'
@@ -1154,14 +1154,14 @@ setGeneric("reduce_dimensions", function(.data,
 	# adjust top for the max number of features I have
 	if(top > .data |> distinct(!!.feature) |> nrow()){
 	  warning(sprintf(
-	    "tidybulk says: the \"top\" argument %s is higher than the number of features %s", 
-	    top, 
+	    "tidybulk says: the \"top\" argument %s is higher than the number of features %s",
+	    top,
 	    .data |> distinct(!!.feature) |> nrow()
 	  ))
-	  
+
 	  top = min(top, .data |> distinct(!!.feature) |> nrow())
 	}
-	
+
 	# Validate data frame
 	if(do_validate()) {
 	validation(.data, !!.element, !!.feature, !!.abundance)
@@ -2611,7 +2611,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula.
 #'
 #' Underlying method for edgeR framework:
-#' 
+#'
 #' 	.data |>
 #'
 #' 	# Filter
@@ -2638,7 +2638,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #'
 #'
 #'	Underlying method for DESeq2 framework:
-#'	
+#'
 #'	keep_abundant(
 #'			factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]),
 #'			minimum_counts = minimum_counts,
@@ -2657,25 +2657,25 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #' counts =
 #' .data %>%
 #'   assay(my_assay)
-#' 
+#'
 #' # Create design matrix for dispersion, removing random effects
 #' design =
 #'   model.matrix(
 #'     object = .formula |> lme4::nobars(),
 #'     data = metadata
 #'   )
-#' 
+#'
 #' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts))
-#' 
+#'
 #'   glmmSeq( .formula,
 #'            countdata = counts ,
 #'            metadata =   metadata |> as.data.frame(),
 #'            dispersion = dispersion,
 #'            progress = TRUE,
 #'            method = method |> str_remove("(?i)^glmmSeq_" ),
 #'   )
-#'   
-#' 
+#'
+#'
 #' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g.,  log fold change, p-value and false discovery rate).
 #'
 #'
@@ -3980,12 +3980,12 @@ setGeneric("test_gene_rank", function(.data,
 
 	# DEPRECATION OF reference function
 	if (is_present(.sample) & !is.null(.sample)) {
-	  
+
 	  # Signal the deprecation to the user
 	  deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.")
 
 	}
-	
+
 	# Get column names
 	.arrange_desc = enquo(.arrange_desc)
 	.entrez = enquo(.entrez)
@@ -4023,14 +4023,14 @@ setGeneric("test_gene_rank", function(.data,
 
 	.data |>
 		select(!!.entrez, !!.arrange_desc) |>
-	  distinct() |> 
-	  
+	  distinct() |>
+
 	  # Select one entrez - NEEDED?
-	  with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> 
+	  with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>
 
-	  # arrange 
+	  # arrange
 	  arrange(desc(!!.arrange_desc)) |>
-	  
+
 	  # Format
 		deframe() |>
 		entrez_rank_to_gsea(species, gene_collections  = gene_sets ) |>
@@ -4967,3 +4967,30 @@ as_matrix <- function(tbl,
 }
 
 
+#' Resolve Complete Confounders of Non-Interest
+#'
+#' This generic function processes a SummarizedExperiment object to handle confounders
+#' that are not of interest in the analysis. It dynamically handles combinations
+#' of provided factors, adjusting the data by nesting and summarizing over these factors.
+#'
+#'
+#' @param se A SummarizedExperiment object that contains the data to be processed.
+#' @param ... Arbitrary number of factor variables represented as symbols or quosures
+#'            to be considered for resolving confounders. These factors are processed
+#'            in combinations of two.
+#'
+#' @rdname resolve_complete_confounders_of_non_interest-methods
+#'
+#' @return A modified SummarizedExperiment object with confounders resolved.
+#'
+#' @examples
+#' # Not run:
+#' # se is a SummarizedExperiment object
+#' # resolve_complete_confounders_of_non_interest(se, factor1, factor2, factor3)
+#' @export
+setGeneric("resolve_complete_confounders_of_non_interest", function(se, ...) {
+  standardGeneric("resolve_complete_confounders_of_non_interest")
+})
+
+
+