statOmics · cvanderaa · Sep 3, 2024 · Aug 22, 2024 · Aug 22, 2024 · Aug 23, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -82,7 +82,7 @@ Collate: 'msqrob-framework.R' 'allGenerics.R'
 Encoding: UTF-8
 VignetteBuilder: knitr
 Roxygen: list(markdown=TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 biocViews:
     Proteomics,
     MassSpectrometry,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(getDF)
 export(getDfPosterior)
 export(getFitMethod)
 export(getModel)
+export(getReferencePresent)
 export(getSigma)
 export(getSigmaPosterior)
 export(getVar)

diff --git a/R/StatModel-methods.R b/R/StatModel-methods.R
@@ -11,6 +11,10 @@
 #'        the linear model coefficients to be tested equal to zero.
 #'        The rownames of the matrix should be equal to the names of
 #'        parameters of the model.
+#' @param acceptDifferentReference `boolean(1)` to indicate if the contrasts that involve
+#'        parameters with modified reference levels are returned. Watch out putting this
+#'        parameter to TRUE can change the interpretation of the logFC. Default is FALSE.
+#'
 #' @examples
 #' data(pe)
 #' # Aggregate peptide intensities in protein expression values
@@ -33,11 +37,14 @@
 
 setMethod(
     "getContrast", "StatModel",
-    function(object, L) {
+    function(object, L, acceptDifferentReference = FALSE) {
         if (!is(L, "matrix")) L <- as.matrix(L)
-        coefs <- getCoef(object)
         out <- matrix(rep(NA, ncol(L)))
         rownames(out) <- colnames(L)
+        if (referenceContrast(getReferencePresent(object), L, acceptDifferentReference)) {
+            return(out)
+        }
+        coefs <- getCoef(object)
         hlp <- try(t(L) %*% coefs[rownames(L)], silent = TRUE)
         if (!is(hlp, "try-error")) out[] <- hlp
         return(out)
@@ -49,10 +56,13 @@ setMethod(
 
 setMethod(
     "varContrast", "StatModel",
-    function(object, L) {
+    function(object, L, acceptDifferentReference = FALSE) {
         if (!is(L, "matrix")) L <- as.matrix(L)
         out <- matrix(NA, ncol(L), ncol(L))
         rownames(out) <- colnames(out) <- colnames(L)
+        if (referenceContrast(getReferencePresent(object), L, acceptDifferentReference)) {
+            return(out)
+        }
         vcovTmp <- getVcovUnscaled(object) * object@varPosterior
         hlp <- try(t(L) %*% vcovTmp[rownames(L), rownames(L)] %*% L, silent = TRUE)
         if (!is(hlp, "try-error")) out[] <- hlp

diff --git a/R/accessors.R b/R/accessors.R
@@ -14,6 +14,8 @@
 #'              \item{getSigmaPosterior(object)}{to get the empirical Bayes standard deviation}
 #'              \item{getVcovUnscaled(object)}{to get the unscaled variance covariance matrix
 #'                of the model parameters}
+#'              \item{getReferencePresent(object)}{to get the vector of logicals indicating if the
+#'               reference level associated with a model parameter is present}
 #'              }
 #'
 #' @rdname statModelAccessors
@@ -95,3 +97,9 @@ setMethod("getVcovUnscaled",
     signature = "StatModel",
     definition = function(object) object@params$vcovUnscaled
 )
+
+#' @rdname statModelAccessors
+setMethod("getReferencePresent",
+    signature = "StatModel",
+    definition = function(object) object@params$referencePresent
+)
diff --git a/R/allGenerics.R b/R/allGenerics.R
@@ -19,7 +19,9 @@ setGeneric("getVarPosterior", function(object) standardGeneric("getVarPosterior"
 #' @export
 setGeneric("getSigmaPosterior", function(object) standardGeneric("getSigmaPosterior"))
 #' @export
-setGeneric("getContrast", function(object, L) standardGeneric("getContrast"))
+setGeneric("getReferencePresent", function(object) standardGeneric("getReferencePresent"))
+#' @export
+setGeneric("getContrast", function(object, ...) standardGeneric("getContrast"))
 #' @export
 setGeneric("getVcovUnscaled", function(object) standardGeneric("getVcovUnscaled"))
 #' @export

diff --git a/R/hypothesisTest-methods.R b/R/hypothesisTest-methods.R
@@ -122,16 +122,16 @@ setMethod(
     adjust.method = "BH",
     modelColumn = "msqrobModels",
     resultsColumnNamePrefix = "",
-    overwrite = FALSE) {
+    overwrite = FALSE,
+    acceptDifferentReference = FALSE) {
         if (!(modelColumn %in% colnames(rowData(object)))) stop("There is no column named \'", modelColumn, "\' with stored models of an msqrob fit in the rowData of the SummarizedExperiment object")
         if (is.null(colnames(contrast)) & resultsColumnNamePrefix == "") resultsColumnNamePrefix <- "msqrobResults"
         if (is.null(colnames(contrast)) & ncol(contrast) > 1) colnames(contrast) <- seq_len(ncol(contrast))
         if ((sum(paste0(resultsColumnNamePrefix, colnames(contrast)) %in% colnames(rowData(object))) > 0) & !overwrite) stop("There is/are already column(s) with names starting with\'", resultsColumnNamePrefix, "\' in the rowData of the SummarizedExperiment object, set the argument overwrite=TRUE to replace the column(s) with the new results or use another name for the argument resultsColumnNamePrefix")
         for (j in seq_len(ncol(contrast)))
         {
-            contrHlp <- contrast[, j]
-            names(contrHlp) <- rownames(contrast)
-            rowData(object)[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object)[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
+            contrHlp <- contrast[, j, drop = FALSE]
+            rowData(object)[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object)[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
         }
         return(object)
     }

diff --git a/R/msqrob-utils.R b/R/msqrob-utils.R
@@ -86,9 +86,126 @@ makeContrast <- function(contrasts, parameterNames) {
     return(t(.chrlinfct2matrix(contrasts, parameterNames)$K))
 }
 
+
+#' Check for presence of reference levels
+#' @description Check if the reference levels of the factors in the model are present for a specific feature.
+#'
+#' @param y A numeric vector with the feature values for each sample.
+#'
+#' @param data A `DataFrame` with information on the design. It has
+#'        the same number of rows as the number of columns (samples) of
+#'        `y`.
+#' 
+#' @param referenceLevels A named list with the reference levels of the factors (named by the factor name)
+#' 
+#' @return A logical vector indicating for each parameter (that comes from a factor variable) in the model 
+#' if his reference level has values for the current feature.
+#' 
+#' @rdname checkReference
+#'
+checkReference <- function(y, data, paramNames, formula) {
+    referenceLevels <- getReferenceLevels(data, formula)
+    factor <- names(referenceLevels)
+    subset <- droplevels(data[!is.na(y), factor, drop = FALSE])
+    paramSplit <- list()
+    for (x in factor) {
+        paramSplit[[x]] <- paramNames[grep(x, paramNames)]
+    }
+    factor_status <- unlist(lapply(factor, function(x) {
+        noIntercept <- all(paste0(x, levels(as.factor(data[, x]))) %in% paramNames)
+        refPres <- referenceLevels[[x]] %in% levels(subset[, x])
+        return(noIntercept || refPres)
+    }))
+    names(factor_status) <- factor
+
+    referencePresent <- propagateFalseStatus(paramSplit, factor_status)
+
+    return(unlist(referencePresent))
+}
+
+#' Get reference levels factors
+#' @description Get the reference levels of the factors of a data frame that are present in a formula
+#'
+#' @param dataFrame A `data.frame` that contains factors
+#' 
+#' @param formula A formula object associated with the data frame (the variables in the formula should be present in the data frame)
+#'
+#' @return A named list with the reference levels of the factors (named by the factor name)
+#' 
+#' @rdname getReferenceLevels
+#' 
+getReferenceLevels <- function(dataFrame, formula){
+    referenceLevels <- list()
+    vars <- all.vars(formula)
+    for (x in vars){
+        if (is.factor(dataFrame[, x]) || is.character(dataFrame[, x])) {
+            if (is.character(dataFrame[, x])) {
+                dataFrame[, x] <- as.factor(dataFrame[, x])
+            }
+            referenceLevels[[x]] <- levels(dataFrame[, x])[1]
+        }
+    }
+    return(unlist(referenceLevels))
+}
+
+#' Check if the reference levels associated with the contrast are present
+#' @description Check if the reference levels associated with the contrast are present
+#'
+#' @param referencePresent A logical vector indicating for each parameter in the model 
+#' if his reference level has values for the current feature.
+#'
+#' @param L A numeric contrast matrix with rownames that equal the model parameters 
+#' that are involved in the contrasts
+#'
+#' @param acceptDifferentReference `boolean(1)` to indicate if the contrasts that involve
+#' parameters with modified reference levels are returned.
+#' 
+#' @return A boolean
+#' 
+#' @rdname referenceContrast
+#' 
+referenceContrast <- function(referencePresent, L, acceptDifferentReference) {
+  if (!acceptDifferentReference && any(!referencePresent[rownames(L)])) {
+    return(TRUE)
+  }
+  return(FALSE)
+}
+
+checkFullRank <- function(modelMatrix) {
+    return(qr(modelMatrix)$rank == ncol(modelMatrix))
+}
+
+
 ##### None exported functions from multcomp package is included here to
 ##### During R and Bioc checks
 
+propagateFalseStatus <- function(vectors, statuses) {
+    repeat {
+        false_elements <- unique(unlist(vectors[statuses == FALSE]))
+
+        new_statuses <- map_lgl(seq_along(vectors), function(i) {
+            if (statuses[i] == TRUE && any(vectors[[i]] %in% false_elements)) {
+                return(FALSE)
+            } else {
+                return(statuses[i])
+            }
+        })
+
+        if (all(new_statuses == statuses)) break
+
+        statuses <- new_statuses
+    }
+
+    all_elements <- unique(unlist(vectors))
+    element_status <- setNames(rep(TRUE, length(all_elements)), all_elements)
+
+    for (i in seq_along(vectors)) {
+        element_status[vectors[[i]]] <- statuses[i]
+    }
+
+    return(element_status)
+}
+
 
 .chrlinfct2matrix <- function(ex, var) {
     if (!is.character(ex)) {
@@ -668,3 +785,4 @@ makeContrast <- function(contrasts, parameterNames) {
         ex, " to character"
     )
 }
+
diff --git a/R/msqrob.R b/R/msqrob.R
@@ -57,22 +57,29 @@ msqrobLm <- function(y,
     robust = TRUE,
     maxitRob = 5) {
     myDesign <- model.matrix(formula, data)
+    missingSamples <- colSums(!is.na(y)) == 0
+    subsettedDesign <- myDesign[!missingSamples, , drop = FALSE]
+    stopifnot("The provided model must be full rank" = checkFullRank(subsettedDesign))
+    paramNames <- colnames(myDesign)
     models <- apply(y, 1,
         function(y, design) {
             ## computatability check
             obs <- is.finite(y)
             type <- "fitError"
+
+            referencePresent <- checkReference(y, data, paramNames, formula)
             model <- list(
-                coefficients = NA, vcovUnscaled = NA,
-                sigma = NA, df.residual = NA, w = NA
+                referencePresent = referencePresent, coefficients = NA,
+                vcovUnscaled = NA, sigma = NA,
+                df.residual = NA, w = NA
             )
 
             if (sum(obs) > 0) {
                 ## subset to finite observations, attention with R column switching
                 X <- design[obs, , drop = FALSE]
-                X <- X[,colMeans(X == 0) != 1 , drop = FALSE]
+                qrX <- qr(X)
+                X <- X[, qrX$pivot[seq_len(qrX$rank)], drop = FALSE]
                 y <- y[obs]
-                colnames_orig <- colnames(design)
 
                 if (robust) {
                     ## use robust regression from MASS package, "M" estimation is used
@@ -108,16 +115,17 @@ msqrobLm <- function(y,
                 }
 
                 if (type != "fitError") {
-                    coef <- rep(NA, length(colnames_orig))
-                    names(coef) <- colnames_orig
+                    coef <- rep(NA, length(paramNames))
+                    names(coef) <- paramNames
                     coef[names(mod$coef)] <- mod$coef
-                    vcovUnscaled <- matrix(NA, nrow =length(colnames_orig), ncol = length(colnames_orig))
-                    rownames(vcovUnscaled) <- colnames(vcovUnscaled) <-  colnames_orig
+                    vcovUnscaled <- matrix(NA, nrow =length(paramNames), ncol = length(paramNames))
+                    rownames(vcovUnscaled) <- colnames(vcovUnscaled) <-  paramNames
                     vcovUnscaled[names(mod$coef), names(mod$coef)] <- msqrob2:::.vcovUnscaled(mod)
 
                     model <- list(
-                        coefficients = mod$coef,
-                        vcovUnscaled = .vcovUnscaled(mod),
+                        referencePresent = referencePresent,
+                        coefficients = coef,
+                        vcovUnscaled = vcovUnscaled,
                         sigma = sigma,
                         df.residual = df.residual,
                         w = w
@@ -770,3 +778,5 @@ msqrobGlm <- function(y,
 
     return(models)
 }
+
+
diff --git a/R/topFeatures.R b/R/topFeatures.R
@@ -18,6 +18,9 @@
 #'        to statistical significance.
 #' @param alpha `numeric` specifying the cutoff value for adjusted p-values.
 #'        Only features with lower p-values are listed.
+#' @param acceptDifferentReference `boolean(1)` to indicate if the contrasts that involve
+#'        parameters with modified reference levels are returned. Watch out putting this
+#'        parameter to TRUE can change the interpretation of the logFC. Default is FALSE.
 #'
 #' @examples
 #' data(pe)
@@ -43,29 +46,37 @@
 #' @author Lieven Clement
 #' @export
 
-topFeatures <- function(models, contrast, adjust.method = "BH", sort = TRUE, alpha = 1) {
+topFeatures <- function(models, contrast, adjust.method = "BH", sort = TRUE, alpha = 1, acceptDifferentReference = FALSE) {
     if (is(contrast, "matrix")) {
-          if (ncol(contrast) > 1) {
-                stop("Argument contrast is matrix with more than one column, only one contrast is allowed")
-            }
+        if (ncol(contrast) > 1) {
+            stop("Argument contrast is matrix with more than one column, only one contrast is allowed")
+        }
     }
-
-    contrast <- contrast[contrast !=0]
+    contrast <- contrast[contrast != 0, , drop = FALSE]
     logFC <- vapply(models,
         getContrast,
         numeric(1),
-        L = contrast
+        L = contrast,
+        acceptDifferentReference = acceptDifferentReference
     )
     se <- sqrt(vapply(models,
         varContrast,
         numeric(1),
-        L = contrast
+        L = contrast,
+        acceptDifferentReference = acceptDifferentReference
     ))
     df <- vapply(models, getDfPosterior, numeric(1))
     t <- logFC / se
     pval <- pt(-abs(t), df) * 2
     adjPval <- p.adjust(pval, method = adjust.method)
+
     out <- data.frame(logFC, se, df, t, pval, adjPval)
+    if (acceptDifferentReference) {
+        out[["DifferentReference"]] <- vapply(models, function(y){
+            any(!getReferencePresent(y)[rownames(contrast)])},
+            logical(1)
+        )
+    }
     if (sort) {
         if (alpha < 1) {
             ids <- adjPval < alpha

diff --git a/man/checkReference.Rd b/man/checkReference.Rd