Skip to content

Commit

Permalink
Merge pull request #307 from stemangiola/add-solve-confounders
Browse files Browse the repository at this point in the history
add method
  • Loading branch information
stemangiola authored Jul 7, 2024
2 parents 5f16b45 + cd6a0fd commit 9c9edcc
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 35 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: tidybulk
Title: Brings transcriptomics to the tidyverse
Version: 1.17.2
Version: 1.17.3
Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
role = c("aut", "cre")),
person("Maria", "Doyle", email = "[email protected]",
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export(quantile_normalise_abundance)
export(reduce_dimensions)
export(remove_redundancy)
export(rename)
export(resolve_complete_confounders_of_non_interest)
export(right_join)
export(rotate_dimensions)
export(rowwise)
Expand Down Expand Up @@ -141,9 +142,11 @@ importFrom(purrr,map2_dfc)
importFrom(purrr,map2_dfr)
importFrom(purrr,map_chr)
importFrom(purrr,map_dfr)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(purrr,reduce)
importFrom(purrr,when)
importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(rlang,dots_list)
importFrom(rlang,dots_values)
Expand All @@ -159,6 +162,7 @@ importFrom(rlang,quo_is_symbol)
importFrom(rlang,quo_is_symbolic)
importFrom(rlang,quo_name)
importFrom(rlang,quo_squash)
importFrom(rlang,set_names)
importFrom(scales,extended_breaks)
importFrom(scales,label_scientific)
importFrom(scales,log_breaks)
Expand Down
24 changes: 13 additions & 11 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -465,26 +465,28 @@ get_differential_transcript_abundance_bulk <- function(.data,
data = df_for_edgeR %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
)

# # Print the design column names in case I want contrasts
# message(
# sprintf(
# "tidybulk says: The design column names are \"%s\"",
# design %>% colnames %>% paste(collapse = ", ")
# )
# )

# Replace `:` with ___ because it creates error with edgeR
if(design |> colnames() |> str_detect(":") |> any()) {
message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.")
colnames(design) = design |> colnames() |> str_replace(":", "___")
}

# Print the design column names in case I want contrasts
message(
sprintf(
"tidybulk says: The design column names are \"%s\"",
design %>% colnames %>% paste(collapse = ", ")
)
)


# Specify the design column tested
if(is.null(.contrasts))
message(
sprintf(
"tidybulk says: The design column being tested is %s",
design %>% colnames %>% .[1]
design %>% colnames %>% .[2]
)
)

Expand Down Expand Up @@ -770,7 +772,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
object = .formula |> lme4::nobars(),
data = metadata
)

if(quo_is_symbolic(.dispersion))
dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe()
else
Expand All @@ -787,8 +789,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,

# Scaling
sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)


glmmSeq_object =
glmmSeq( .formula,
countdata = counts ,
Expand Down
91 changes: 87 additions & 4 deletions R/functions_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ get_reduced_dimensions_TSNE_bulk_SE <-
#' @param of_samples A boolean
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#' @param ... Further parameters passed to the function uwot::tumap
#'
#' @return A tibble with additional columns
#'
Expand Down Expand Up @@ -1145,7 +1145,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
object = .formula |> lme4::nobars(),
data = metadata
)

if(quo_is_symbolic(.dispersion))
dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe()
else
Expand All @@ -1162,8 +1162,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,

# Scaling
sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)


glmmSeq_object =
glmmSeq( .formula,
countdata = counts ,
Expand Down Expand Up @@ -1496,3 +1496,86 @@ univariable_differential_tissue_composition_SE = function(

unnest(surv_test, keep_empty = TRUE)
}


#' Resolve Complete Confounders of Non-Interest
#'
#' This function processes a SummarizedExperiment object to handle confounders
#' that are not of interest in the analysis. It deals with two factors, adjusting
#' the data by nesting and summarizing over these factors.
#'
#' @importFrom rlang enquo
#' @importFrom rlang quo_name
#' @importFrom rlang !!
#' @importFrom tidyr unnest
#' @importFrom tidyr nest
#' @importFrom dplyr mutate
#' @importFrom dplyr arrange
#' @importFrom dplyr slice
#' @importFrom dplyr pull
#' @importFrom dplyr select
#' @importFrom purrr map_int
#' @importFrom dplyr if_else
#'
#' @param se A SummarizedExperiment object that contains the data to be processed.
#' @param .factor_1 A symbol or quosure representing the first factor variable in `se`.
#' @param .factor_2 A symbol or quosure representing the second factor variable in `se`.
#' @return A modified SummarizedExperiment object with confounders resolved.
#' @examples
#' # Not run:
#' # se is a SummarizedExperiment object
#' resolve_complete_confounders_of_non_interest(se, .factor_1 = factor1, .factor_2 = factor2)
#' @noRd
resolve_complete_confounders_of_non_interest_pair_SE <- function(se, .factor_1, .factor_2){

.factor_1 <- enquo(.factor_1)
.factor_2 <- enquo(.factor_2)

cd =
colData(se) |>
as_tibble() |>
rowid_to_column() |>
distinct(rowid, !!.factor_1, !!.factor_2) |>

nest(se_data = -c(!!.factor_1, !!.factor_2)) |>
nest(data = -!!.factor_1) |>
mutate(n1 = map_int(data, ~ .x |> distinct(!!.factor_2) |> nrow())) |>
unnest(data) |>

# Nest data excluding .factor_2 and count distinct .factor_1 values
nest(data = -!!.factor_2) |>
mutate(n2 = map_int(data, ~ .x |> distinct(!!.factor_1) |> nrow())) |>
unnest(data)

# Choose a dummy value for .factor_2 based on sorting by n1 + n2
dummy_factor_2 <- cd |> arrange(desc(n1 + n2)) |> slice(1) |> pull(!!.factor_2)

# Messages if I have confounders
if(cd |> filter(n1 + n2 < 3) |> nrow() > 0){

message(sprintf("tidybulk says: IMPORTANT! the columns %s and %s, have been corrected for complete confounders and now are NOT interpretable, and cannot be used in hypothesis testing.", quo_name(.factor_1), quo_name(.factor_2)))

message(sprintf(
"tidybulk says: The value(s) %s in column %s from sample(s) %s, has been changed to %s.",
cd |> filter(n1 + n2 < 3) |> pull(!!.factor_2),
quo_name(.factor_2),
cd |> filter(n1 + n2 < 3) |> pull(se_data) |> map(colnames) |> unlist(),
dummy_factor_2
))

# Replace .factor_2 with dummy_factor_2 where n1 + n2 is less than 3 and unnest
cd = cd |>
mutate(!!.factor_2 := if_else(n1 + n2 < 3, dummy_factor_2, !!.factor_2))
}

colData(se)[,c(quo_name(.factor_1), quo_name(.factor_2))] =
cd |>
unnest(se_data) |>
arrange(rowid) |>
select(!!.factor_1, !!.factor_2) |>
DataFrame()

se

}

65 changes: 46 additions & 19 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
#' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE.
#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).
#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE"
#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap"
#'
#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#'
Expand Down Expand Up @@ -1154,14 +1154,14 @@ setGeneric("reduce_dimensions", function(.data,
# adjust top for the max number of features I have
if(top > .data |> distinct(!!.feature) |> nrow()){
warning(sprintf(
"tidybulk says: the \"top\" argument %s is higher than the number of features %s",
top,
"tidybulk says: the \"top\" argument %s is higher than the number of features %s",
top,
.data |> distinct(!!.feature) |> nrow()
))

top = min(top, .data |> distinct(!!.feature) |> nrow())
}

# Validate data frame
if(do_validate()) {
validation(.data, !!.element, !!.feature, !!.abundance)
Expand Down Expand Up @@ -2611,7 +2611,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula.
#'
#' Underlying method for edgeR framework:
#'
#'
#' .data |>
#'
#' # Filter
Expand All @@ -2638,7 +2638,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#'
#'
#' Underlying method for DESeq2 framework:
#'
#'
#' keep_abundant(
#' factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]),
#' minimum_counts = minimum_counts,
Expand All @@ -2657,25 +2657,25 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#' counts =
#' .data %>%
#' assay(my_assay)
#'
#'
#' # Create design matrix for dispersion, removing random effects
#' design =
#' model.matrix(
#' object = .formula |> lme4::nobars(),
#' data = metadata
#' )
#'
#'
#' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts))
#'
#'
#' glmmSeq( .formula,
#' countdata = counts ,
#' metadata = metadata |> as.data.frame(),
#' dispersion = dispersion,
#' progress = TRUE,
#' method = method |> str_remove("(?i)^glmmSeq_" ),
#' )
#'
#'
#'
#'
#' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate).
#'
#'
Expand Down Expand Up @@ -3980,12 +3980,12 @@ setGeneric("test_gene_rank", function(.data,

# DEPRECATION OF reference function
if (is_present(.sample) & !is.null(.sample)) {

# Signal the deprecation to the user
deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.")

}

# Get column names
.arrange_desc = enquo(.arrange_desc)
.entrez = enquo(.entrez)
Expand Down Expand Up @@ -4023,14 +4023,14 @@ setGeneric("test_gene_rank", function(.data,

.data |>
select(!!.entrez, !!.arrange_desc) |>
distinct() |>
distinct() |>

# Select one entrez - NEEDED?
with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>
with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>

# arrange
# arrange
arrange(desc(!!.arrange_desc)) |>

# Format
deframe() |>
entrez_rank_to_gsea(species, gene_collections = gene_sets ) |>
Expand Down Expand Up @@ -4967,3 +4967,30 @@ as_matrix <- function(tbl,
}


#' Resolve Complete Confounders of Non-Interest
#'
#' This generic function processes a SummarizedExperiment object to handle confounders
#' that are not of interest in the analysis. It dynamically handles combinations
#' of provided factors, adjusting the data by nesting and summarizing over these factors.
#'
#'
#' @param se A SummarizedExperiment object that contains the data to be processed.
#' @param ... Arbitrary number of factor variables represented as symbols or quosures
#' to be considered for resolving confounders. These factors are processed
#' in combinations of two.
#'
#' @rdname resolve_complete_confounders_of_non_interest-methods
#'
#' @return A modified SummarizedExperiment object with confounders resolved.
#'
#' @examples
#' # Not run:
#' # se is a SummarizedExperiment object
#' # resolve_complete_confounders_of_non_interest(se, factor1, factor2, factor3)
#' @export
setGeneric("resolve_complete_confounders_of_non_interest", function(se, ...) {
standardGeneric("resolve_complete_confounders_of_non_interest")
})



Loading

0 comments on commit 9c9edcc

Please sign in to comment.