diff --git a/assets/ParallelEvolCCM_supplement.tar.gz b/assets/ParallelEvolCCM_supplement.tar.gz new file mode 100644 index 00000000..ba17f759 Binary files /dev/null and b/assets/ParallelEvolCCM_supplement.tar.gz differ diff --git a/bin/ParallelEvolCCM.R b/bin/ParallelEvolCCM.R index a3bf3d4b..7918c7e5 100755 --- a/bin/ParallelEvolCCM.R +++ b/bin/ParallelEvolCCM.R @@ -1,13 +1,40 @@ #!/usr/bin/env Rscript -# Load the required packages -suppressPackageStartupMessages(library(ape)) +### ParallelEvolCCM.R +### Hosted at: https://github.com/beiko-lab/arete/blob/master/bin/ParallelEvolCCM.R +### Version 1.0, released *** June 2024 +### Released under MIT License (https://opensource.org/license/mit) +### Note: igraph requires the OpenBlas library, which can be installed with: +### sudo apt-get install libopenblas-dev +### devtools: sudo apt-get install libssl-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev + +# List of CRAN packages to check and install +cran_packages <- + c("ape", + "dplyr", + "remotes", + "phytools", + "foreach", + "doParallel", + "gplots") + +for (package in cran_packages) { + if (!suppressPackageStartupMessages(require(package, character.only = TRUE))) { + install.packages(package) + } + suppressPackageStartupMessages(library(package, character.only = TRUE)) +} + +if (!require(phytools)) { + remotes::install_github("liamrevell/phytools") +} + +if (!require(evolCCM)) { + remotes::install_github("beiko-lab/evolCCM") +} + suppressPackageStartupMessages(library(evolCCM)) -suppressPackageStartupMessages(library(dplyr)) -suppressPackageStartupMessages(library(phytools)) -suppressPackageStartupMessages(library(foreach)) -suppressPackageStartupMessages(library(doParallel)) -suppressPackageStartupMessages(library(gplots)) + ##################################################################### ### FUNCTION PARSE_ARGS @@ -17,22 +44,40 @@ suppressPackageStartupMessages(library(gplots)) ##################################################################### parse_args <- function(args) { - # Check if no arguments are provided or if '-h' flag is given if (length(args) == 0 || any(args == "-h")) { - cat("Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...] [--cores ] [--min_abundance <0.0-1.0>] [--max_abundance <0.0-1.0>]\n") + cat( + "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...] [--cores ] [--min_abundance <0.0-1.0>] [--max_abundance <0.0-1.0>]\n" + ) cat("Options:\n") cat(" --intree : Path to the tree file (required).\n") cat(" --intable : Path to the table file (required).\n") - cat(" --compare_from : Comma-separated list of values for comparison (optional).\n") - cat(" --compare_to : Comma-separated list of values for comparison (optional).\n") - cat(" --cores : Specify number of cores for parallel processing or '-1' for all cores (optional, default is 1).\n") - cat(" --min_abundance <0.0-1.0> : Minimum abundance proportion threshold (optional, default is 0.0).\n") - cat(" --max_abundance <0.0-1.0> : Maximum abundance proportion threshold (optional, default is 1.0).\n") + cat( + " --compare_from : Comma-separated list of values for comparison (optional).\n" + ) + cat( + " --compare_to : Comma-separated list of values for comparison (optional).\n" + ) + cat( + " --cores : Specify number of cores for parallel processing or '-1' for all cores (optional, default is 1).\n" + ) + cat( + " --min_abundance <0.0-1.0> : Minimum abundance proportion threshold (optional, default is 0.0).\n" + ) + cat( + " --max_abundance <0.0-1.0> : Maximum abundance proportion threshold (optional, default is 1.0).\n" + ) + cat( + " --show_nans : Show NaNs in matrix (default is to convert non-converging X^2 values to 0 and p-values to 1.\n" + ) cat(" -h : Show this help message.\n") - quit(save="no") + quit(save = "no") } + cat("\n\n*********************** PARALLELEVOLCCM ***********************\n\n") + + cat("Parsing arguments...\n") + # Function to parse command line arguments without '=' parseArg <- function(argname, options) { index <- which(options == argname) @@ -49,6 +94,7 @@ parse_args <- function(args) { min_abundance_str <- parseArg("--min_abundance", args) max_abundance_str <- parseArg("--max_abundance", args) cores_str <- parseArg("--cores", args) + show_nans <- "--show_nans" %in% args # Parse the optional --compare_from and --compare_to arguments compare_from_str <- parseArg("--compare_from", args) @@ -74,7 +120,8 @@ parse_args <- function(args) { if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { min_abund <- temp_val } else { - stop("--min_abundance must be a proportional value between 0.0 and 1.0", call. = FALSE) + stop("--min_abundance must be a proportional value between 0.0 and 1.0", + call. = FALSE) } } @@ -84,15 +131,18 @@ parse_args <- function(args) { if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { max_abund <- temp_val } else { - stop("--max_abundance must be a proportional value between 0.0 and 1.0", call. = FALSE) + stop("--max_abundance must be a proportional value between 0.0 and 1.0", + call. = FALSE) } } # Check the relationship between min_abund and max_abund if (min_abund > max_abund) { - stop("--min_abundance value must be less than or equal to --max_abundance value", call. = FALSE) + stop("--min_abundance value must be less than or equal to --max_abundance value", + call. = FALSE) } + # Determine the parallel processing parameters if (is.null(cores_str)) { num_cores <- 1 } else if (cores_str == "-1") { @@ -100,22 +150,29 @@ parse_args <- function(args) { } else { num_cores <- as.integer(cores_str) if (is.na(num_cores) || num_cores <= 0) { - stop("--cores argument must be a positive integer or '-1' for all cores", call. = FALSE) + stop("--cores argument must be a positive integer or '-1' for all cores", + call. = FALSE) } } - registerDoParallel(cores=num_cores) # Check if both --intree and --intable are provided if (is.null(inputTree) || is.null(inputProfile)) { - stop("Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...]", call. = FALSE) + stop( + "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...]", + call. = FALSE + ) } - parsed_list <- list("inputTree" = inputTree, - "inputProfile" = inputProfile, - "compare_from_vector" = compare_from_vector, - "compare_to_vector" = compare_to_vector, - "min_abund" = min_abund, - "max_abund" = max_abund) + parsed_list <- list( + "inputTree" = inputTree, + "inputProfile" = inputProfile, + "compare_from_vector" = compare_from_vector, + "compare_to_vector" = compare_to_vector, + "min_abund" = min_abund, + "max_abund" = max_abund, + "show_nans" = show_nans, + "num_cores" = num_cores + ) return(parsed_list) } @@ -128,20 +185,25 @@ parse_args <- function(args) { ### RETURNS the corrected tree ########################################################################## -modify_tree <- function(tree,fix_multifurcations,coerce_branches) { - +modify_tree <- function(tree, + fix_multifurcations, + coerce_branches) { if (!is.rooted(tree)) { - warning("Input tree is not rooted. Applying midpoint rooting algorithm.") + cat(noquote( + "*** Input tree is not rooted! Applying midpoint rooting algorithm. ***\n" + )) tree <- midpoint.root(tree) } ### Arbitrarily resolve multifurcations and zero-length branches ### if (!is.binary(tree)) { if (fix_multifurcations) { - warning("Input tree is not binary! Forcing the issue.") - tree<-multi2di(tree) + cat(noquote("*** Input tree is not binary! Forcing the issue. ***\n")) + tree <- multi2di(tree) } else { - stop("Tree contains multifurcations and \"fix_multifurcations\" is set to FALSE. Exiting...") + stop( + "Tree contains multifurcations and \"fix_multifurcations\" is set to FALSE. Exiting..." + ) } } @@ -149,9 +211,23 @@ modify_tree <- function(tree,fix_multifurcations,coerce_branches) { if (coerce_branches) { short_branch_count <- sum(tree$edge.length < min_branch_length) if (short_branch_count > 0) { - notification_string = paste(short_branch_count, " of ",nrow(tree$edge)," edges are less than the specified threshold of ",min_branch_length,".", sep = "") - warning(paste(notification_string,"Setting these branches to equal the threshold; this will affect the results!")) - tree$edge.length[tree$edge.length < min_branch_length] <- min_branch_length + notification_string = paste( + short_branch_count, + " of ", + nrow(tree$edge), + " edges are less than the specified threshold of ", + min_branch_length, + ".", + sep = "" + ) + warning( + paste( + notification_string, + "Setting these branches to equal the threshold; this will affect the results!" + ) + ) + tree$edge.length[tree$edge.length < min_branch_length] <- + min_branch_length } } return(tree) @@ -167,94 +243,106 @@ modify_tree <- function(tree,fix_multifurcations,coerce_branches) { ### RETURN a filtered profile, as long as nothing is broken ######################################################################################## -check_and_filter_profile <- function(aprofile,tree,min_abund,max_abund) { - - # Get the leaf labels from the phylogenetic tree - tree_labels <- tree$tip.label - - # Get the row labels from the dataframe - df_labels <- rownames(aprofile) - - mismatch <- 0 - # Find labels that are in the tree but not in the dataframe - missing_in_df <- setdiff(tree_labels, df_labels) - if (length(missing_in_df) > 0) { - print("Labels in the tree but not in the dataframe:",quote=FALSE) - print(missing_in_df) - mismatch = 2 - } +check_and_filter_profile <- + function(aprofile, tree, min_abund, max_abund) { + # Get the leaf labels from the phylogenetic tree + tree_labels <- tree$tip.label + + # Get the row labels from the dataframe + df_labels <- rownames(aprofile) + + mismatch <- 0 + # Find labels that are in the tree but not in the dataframe + missing_in_df <- setdiff(tree_labels, df_labels) + if (length(missing_in_df) > 0) { + cat("Labels in the tree but not in the dataframe:", quote = FALSE) + cat(missing_in_df) + mismatch = 2 + } - # Find labels that are in the dataframe but not in the tree - missing_in_tree <- setdiff(df_labels, tree_labels) - if (length(missing_in_tree) > 0) { - print("Labels in the dataframe but not in the tree:",quote=FALSE) - print(missing_in_tree) - mismatch = 1 - } + # Find labels that are in the dataframe but not in the tree + missing_in_tree <- setdiff(df_labels, tree_labels) + if (length(missing_in_tree) > 0) { + cat("Labels in the dataframe but not in the tree:", quote = FALSE) + cat(missing_in_tree) + mismatch = 1 + } - if (mismatch == 1) { - stop("Exiting...") - } + if (mismatch == 1) { + stop("Exiting...") + } - print("All names match between tree and matrix.",quote=FALSE) - - ### ABUNDANCE FILTER ### - - aprofile <- aprofile[tree$tip.label,] - aprofile[aprofile > 1] <- 1 - - numFeatures <- dim(aprofile)[2] - print(paste("Total number of features: ",numFeatures),quote=FALSE) - - ### Abundance filter: subset the dataframe to include only those columns that satisfy the proportion criteria ### - proportion_ones <- apply(aprofile, 2, function(col) sum(col == 1) / length(col)) - aprofile <- aprofile[, proportion_ones >= min_abund & proportion_ones <= max_abund] - numFeatures <- dim(aprofile)[2] - print(paste("Total number of features post abundance filtering: ",numFeatures),quote=FALSE) - - ### CLADE FILTERING: Remove any feature that maps perfectly onto a clade in the rooted phylogenetic tree ### - # This function will recursively explore the tree and return the clades - # Function to get all descendant tips of a node - #if(clade_filter) { - if(FALSE) { - get_descendant_tips <- function(tree, node) { - if (node <= length(tree$tip.label)) { - return(tree$tip.label[node]) - } else { - descend <- which(tree$edge[, 1] == node) - tips <- c() - for (i in descend) { - tips <- c(tips, get_descendant_tips(tree, tree$edge[i, 2])) + cat("All names match between tree and matrix.\n") + + ### ABUNDANCE FILTER ### + + aprofile <- aprofile[tree$tip.label,] + aprofile[aprofile > 1] <- 1 + + numFeatures <- dim(aprofile)[2] + cat(paste("Total number of features: ", numFeatures, "\n")) + + ### Abundance filter: subset the dataframe to include only those columns that satisfy the proportion criteria ### + proportion_ones <- + apply(aprofile, 2, function(col) + sum(col == 1) / length(col)) + aprofile <- + aprofile[, proportion_ones >= min_abund & + proportion_ones <= max_abund] + numFeatures <- dim(aprofile)[2] + cat(paste( + "Total number of features post abundance filtering: ", + numFeatures, + "\n" + )) + + ### CLADE FILTERING: Remove any feature that maps perfectly onto a clade in the rooted phylogenetic tree ### + # This function will recursively explore the tree and return the clades + # Function to get all descendant tips of a node + #if(clade_filter) { + if (FALSE) { + get_descendant_tips <- function(tree, node) { + if (node <= length(tree$tip.label)) { + return(tree$tip.label[node]) + } else { + descend <- which(tree$edge[, 1] == node) + tips <- c() + for (i in descend) { + tips <- c(tips, get_descendant_tips(tree, tree$edge[i, 2])) + } + return(tips) } - return(tips) } - } - - # Get the clades for all internal nodes - clades <- lapply((length(tree$tip.label) + 1):(length(tree$tip.label) + tree$Nnode), - function(node) get_descendant_tips(tree, node)) - # Function to check if a feature is unique to a clade - check_feature <- function(feature){ - for(clade in clades){ - clade_data <- aprofile[unlist(clade), feature] - outside_clade_data <- aprofile[setdiff(rownames(aprofile), unlist(clade)), feature] - - if (all(clade_data == 1, na.rm = TRUE) & all(outside_clade_data == 0, na.rm = TRUE)){ - return(TRUE) + # Get the clades for all internal nodes + clades <- + lapply((length(tree$tip.label) + 1):(length(tree$tip.label) + tree$Nnode), + function(node) + get_descendant_tips(tree, node)) + + # Function to check if a feature is unique to a clade + check_feature <- function(feature) { + for (clade in clades) { + clade_data <- aprofile[unlist(clade), feature] + outside_clade_data <- + aprofile[setdiff(rownames(aprofile), unlist(clade)), feature] + + if (all(clade_data == 1, na.rm = TRUE) & + all(outside_clade_data == 0, na.rm = TRUE)) { + return(TRUE) + } } + return(FALSE) } - return(FALSE) - } - for(feature in colnames(aprofile)){ - if(check_feature(feature)){ - filtered_data <- filtered_data %>% select(-feature) + for (feature in colnames(aprofile)) { + if (check_feature(feature)) { + filtered_data <- filtered_data %>% select(-feature) + } } } + return(aprofile) } - return(aprofile) -} #################################################################### ### FUNCTION PROCESS_PAIR @@ -269,11 +357,12 @@ process_pair <- function(i, j, filtered_data, tree) { res <- tryCatch({ #if ( if (j %% 50 == 0) { - print(paste(i,"vs",j)) + print(paste(i, "vs", j)) } # Run EstimateCCM for the current pair - aE <- EstimateCCM(filtered_data[, c(i, j)], phytree = tree, trace = FALSE) + aE <- + EstimateCCM(filtered_data[, c(i, j)], phytree = tree, trace = FALSE) estimatedRates <- aE$nlm.par # Retrieve the score and pvalue @@ -285,7 +374,12 @@ process_pair <- function(i, j, filtered_data, tree) { res <- c(aE$nlm.par, interact_score, interact_pval) # Return the result along with i, j, and column names - list(i = i, j = j, colnames = colnames(filtered_data[, c(i, j)]), res = res) + list( + i = i, + j = j, + colnames = colnames(filtered_data[, c(i, j)]), + res = res + ) #} }, error = function(e) { # Return NULL in case of an error @@ -305,8 +399,7 @@ process_pair <- function(i, j, filtered_data, tree) { ########################################################################## constructMatrix <- function(lines, outputFile, column) { - -# Initialize an empty list to store the entities + # Initialize an empty list to store the entities entities <- list() for (line in lines) { @@ -326,7 +419,8 @@ constructMatrix <- function(lines, outputFile, column) { } # Initialize an empty matrix to store the distances - distances <- matrix(nrow = length(entities), ncol = length(entities)) + distances <- + matrix(nrow = length(entities), ncol = length(entities)) for (line in lines) { # Parse the line into fields @@ -347,7 +441,10 @@ constructMatrix <- function(lines, outputFile, column) { colnames(distances) <- entities # Write the matrix to a file - write.table(distances, file = outputFile, sep = "\t", quote = FALSE) + write.table(distances, + file = outputFile, + sep = "\t", + quote = FALSE) } @@ -379,60 +476,82 @@ args <- commandArgs(trailingOnly = TRUE) parsed <- parse_args(args) ### Output file name (could be changed to a command-line argument) ### -outputTree <- paste("EvolCCM_", parsed$inputTree, sep = "") -outputFile <- paste("EvolCCM_", parsed$inputProfile, sep = "") +outputTree <- + paste("EvolCCM_", basename(parsed$inputTree), sep = "") +outputFile <- + paste("EvolCCM_", basename(parsed$inputProfile), sep = "") ###################### READ THE TREE FILE AND FIX IF NECESSARY ###################### +cat("\nReading tree...\n") tree <- read.tree(parsed$inputTree) -tree <- modify_tree(tree,fix_multifurcations,coerce_branches) -write.tree(tree,outputTree) +tree <- modify_tree(tree, fix_multifurcations, coerce_branches) +write.tree(tree, outputTree) ###################### READ THE PROFILE AND APPLY FILTERS ###################### ### Read the profile file ### -aprofile <- read.csv(parsed$inputProfile, row.names=1,sep="\t") -filtered_data <- check_and_filter_profile(aprofile, tree, parsed$min_abund, parsed$max_abund) +cat("\nReading profile...\n") +aprofile <- read.csv(parsed$inputProfile, row.names = 1, sep = "\t") +filtered_data <- + check_and_filter_profile(aprofile, tree, parsed$min_abund, parsed$max_abund) numFeatures <- dim(filtered_data)[2] ######################## PARALLELIZED NESTED LOOP FOR PAIRWISE PROFILE COMPARISONS ######################## +cat("\nInitiating comparisons...\n") +registerDoParallel(cores = parsed$num_cores) + compare_from_vector <- parsed$compare_from_vector compare_to_vector <- parsed$compare_to_vector -results <- foreach(i = 1:(numFeatures - 1), .combine = 'c', .multicombine = TRUE) %dopar% { - i_name <- colnames(filtered_data)[i] - - # If compare_from_str is defined, then check the condition. If not, then set to TRUE by default. - i_condition <- ifelse(!is.null(compare_from_vector), - any(sapply(compare_from_vector, function(x) {startsWith(i_name, x)})), - TRUE) - - if (i_condition) { - print(paste("Processing feature #", i, " (", i_name, ")", sep = ""), quote = FALSE) - - sapply((i + 1):numFeatures, function(j) { - j_name <- colnames(filtered_data)[j] - - # If compare_to_str is defined, then check the condition. If not, then set to TRUE by default. - j_condition <- ifelse(!is.null(compare_to_vector), - any(sapply(compare_to_vector, function(x) {startsWith(j_name, x)})), - TRUE) - - if (j_condition) { - return(process_pair(i, j, filtered_data, tree)) - } else { - return(NULL) - } - }, simplify = FALSE) - } else { - return(NULL) +results <- + foreach( + i = 1:(numFeatures - 1), + .combine = 'c', + .multicombine = TRUE + ) %dopar% { + i_name <- colnames(filtered_data)[i] + + # If compare_from_str is defined, then check the condition. If not, then set to TRUE by default. + i_condition <- ifelse(!is.null(compare_from_vector), + any(sapply(compare_from_vector, function(x) { + startsWith(i_name, x) + })), + TRUE) + + if (i_condition) { + print(paste("Processing feature #", i, " (", i_name, ")", sep = ""), + quote = FALSE) + + sapply((i + 1):numFeatures, function(j) { + j_name <- colnames(filtered_data)[j] + + # If compare_to_str is defined, then check the condition. If not, then set to TRUE by default. + j_condition <- ifelse(!is.null(compare_to_vector), + any(sapply(compare_to_vector, function(x) { + startsWith(j_name, x) + })), + TRUE) + + if (j_condition) { + return(process_pair(i, j, filtered_data, tree)) + } else { + return(NULL) + } + }, simplify = FALSE) + } else { + return(NULL) + } } -} + +cat("\nProfile comparisons complete!\n") ################################# OUTPUT ##################################### +cat("\nWriting output...\n") + ### Process the results and write them to chunks of output files ### chunk_size <- 100 output_file_pattern <- "output_temp_chunk_%d.txt" @@ -445,9 +564,23 @@ for (result in results) { temp_output_file <- sprintf(output_file_pattern, chunk_index) chunk_index <- chunk_index + 1 } - cat(paste(result$colnames,collapse="\t"), file = temp_output_file, append = TRUE) + ### Fix NaNs unless user has specified otherwise + if (parsed$show_nans != 1) { + if (is.nan(result$res[6])) { + # print(result$res) + # print("Fixing NaN") + result$res[6] = 0 + result$res[7] = 1 + # print(result$res) + } + } + cat(paste(result$colnames, collapse = "\t"), + file = temp_output_file, + append = TRUE) cat("\t", file = temp_output_file, append = TRUE) - cat(paste(result$res,collapse="\t"), file = temp_output_file, append = TRUE) + cat(paste(result$res, collapse = "\t"), + file = temp_output_file, + append = TRUE) cat("\n", file = temp_output_file, append = TRUE) result_counter <- result_counter + 1 @@ -457,7 +590,19 @@ for (result in results) { # Merge the chunks of output files into the main output file temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt") -outputHeadings <- paste("feature1","feature2","intrisic1", "intrisic2", "gainloss1", "gainloss2", "interaction", "interact_score", "interact_pval",sep="\t") +outputHeadings <- + paste( + "feature1", + "feature2", + "intrisic1", + "intrisic2", + "gainloss1", + "gainloss2", + "interaction", + "interact_score", + "interact_pval", + sep = "\t" + ) outputFilegz = gzfile(outputFile, "w") cat(outputHeadings, file = outputFilegz, sep = "\n") for (temp_file in temp_files) { @@ -465,7 +610,10 @@ for (temp_file in temp_files) { temp_contents <- readLines(temp_file) # Write the contents to the main output file - cat(temp_contents, file = outputFilegz, append = TRUE, sep = "\n") + cat(temp_contents, + file = outputFilegz, + append = TRUE, + sep = "\n") # Remove the temporary file file.remove(temp_file) @@ -474,8 +622,10 @@ for (temp_file in temp_files) { # Read the lines from the input file close(outputFilegz) lines <- readLines(outputFile)[-1] -X2file = paste(outputFile,"X2",sep = ".") -Pfile = paste(outputFile,"pvals",sep = ".") +X2file = paste(outputFile, "X2", sep = ".") +Pfile = paste(outputFile, "pvals", sep = ".") + +constructMatrix(lines, X2file, 8) +constructMatrix(lines, Pfile, 9) -constructMatrix(lines,X2file,8) -constructMatrix(lines,Pfile,9) +cat("ParallelEvolCCM run done.\n\n") diff --git a/docs/evolccm.md b/docs/evolccm.md index a8d21043..3e886458 100644 --- a/docs/evolccm.md +++ b/docs/evolccm.md @@ -30,6 +30,31 @@ ED178 0 1 ED180 0 0 ``` +## Using ParallelEvolCCM with ARETE + +The ParallelEvolCCM tool is also made available through the `evolccm` entry in ARETE. +Making it possible to run the tool with Docker or Singularity. + +To execute the ParallelEvolCCM tool with ARETE, run the following command: + +```bash +nextflow run beiko-lab/ARETE \ + -entry evolccm \ + --core_gene_tree core_gene_alignment.tre \ + --feature_profile feature_profile.tsv.gz \ + -profile docker +``` + +The parameters being: + +- `--core_gene_tree` - The reference tree, coming from a core genome alignment, + like the one generated by the `phylo` entry in ARETE. +- `--feature_profile` - A presence/absence TSV matrix of features + in genomes, like the one created in ARETE's `annotation` entry. +- `-profile` - The profile to use. In this case, `docker`. + +For more information, check the [full ARETE documentation](https://beiko-lab.github.io/arete/). + ## Using ParallelEvolCCM by itself The ParallelEvolCCM tool is a command line tool written in R. @@ -42,8 +67,17 @@ wget https://raw.githubusercontent.com/beiko-lab/arete/master/bin/ParallelEvolCC chmod +x ParallelEvolCCM.R ``` -Then, ensure all EvolCCM dependencies are installed. -You can install them by running the following command in your R console: +ParallelEvolCCM.R has several dependencies, which should automatically be installed +the first time you run the script. You may need to install missing Linux packages using +the following command: + +```bash +sudo apt-get install libssl-dev libfontconfig1-dev libharfbuzz-dev \ +libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \ +libopenblas-dev +``` + +If you prefer to install the dependencies manually, you can do so by running the following R commands: ```r install.packages(c('ape', 'dplyr', 'phytools', 'foreach', 'doParallel', 'gplots', 'remotes')) @@ -62,27 +96,118 @@ You can then run the tool like this: Additional parameters can be found by running `./ParallelEvolCCM.R` with no additional parameters. -## Using ParallelEvolCCM with ARETE +## ParallelEvolCCM Release Package -The ParallelEvolCCM tool is also made available through the `evolccm` entry in ARETE. -Making it possible to run the tool with Docker or Singularity. +We also provide test data that can be used to run EvolCCM along with some useful scripts +for downstream analyses. These can be found in a .tar.gz file, which you can download like this: -To execute the ParallelEvolCCM tool with ARETE, run the following command: +```bash +wget https://raw.githubusercontent.com/beiko-lab/arete/master/assets/ParallelEvolCCM_supplement.tar.gz +tar -xzf ParallelEvolCCM_supplement.tar.gz +``` + +### Structure + +The tarball contains three subfolders: + +- `Scripts/` - The R scripts used to generate feature and statistical histograms, and the +Python script that is used to build the GraphML files from PECCM output. + +- `100Bifido/` - The results of the 100-genome analysis described in the paper. + +- `1000Bifido/` - The results of the 1000-genome analysis described in the paper. + +Each of the results folders contains the following subdirectories and files. X is a placeholder for the size (i.e., 100 or 1000): + +- `SourceFiles/` - The source files. + + - `Bifido_X.tre`: The Newick-formatted tree + + - `Bifido_X_feature_profile`: The tab-separated feature file. **This is the input +file for ‘PECCM BuildFeatureHistogram.R’, see usage below** + +- `Results/` - The results produced by PECCM and the helper scripts. + + - `EvolCCM_Bifido_X.tre`: The tree used by EvolCCM (with midpoint rooting +and multifurcating node resolution if necessary) + + - `EvolCCM_Bifido_X_feature_profile.tsv`: A tab-separated file with the p-values +and statistics for all pairwise comparisons between features. This is the +input file for the scripts `PECCM_BuildStatHistogram.R` +and `PECCM_Build_GraphML.py`, see below + + - `EvolCCM_Bifido_X_feature_profile.tsv.pvals` and +`EvolCCM_Bifido_X_feature_profile.tsv.X2`: Tab-separated matrices showing +the p-values and X2 scores for all features. + + - `EvolCCM_Bifido_100.graphml`: GraphML-formatted file with connections between features. + + - Four .jpg files: 'a' is the output feature profile, and 'b', 'c', and 'd' are the +feature and statistical distributions. + +### Additional Scripts + +First, in order to recreate the results of +the 100-genome dataset, run the command below (specifying any reasonable number of cores): ```bash -nextflow run beiko-lab/ARETE \ - -entry evolccm \ - --core_gene_tree core_gene_alignment.tre \ - --feature_profile feature_profile.tsv.gz \ - -profile docker +ParallelEvolCCM.R --intree Bifido_100.tre \ + --intable Bifido_100_feature_profile.tsv \ + --min abundance 0.05 --max abundance 0.95 --cores 8 ``` -The parameters being: +Four output files will be produced. All will be prefixed with 'EvolCCM' to distiguish +them from the input files. -- `--core_gene_tree` - The reference tree, coming from a core genome alignment, - like the one generated by the `phylo` entry in ARETE. -- `--feature_profile` - A presence/absence TSV matrix of features - in genomes, like the one created in ARETE's `annotation` entry. -- `-profile` - The profile to use. In this case, `docker`. +- .tre file: The tree used by EvolCCM (with midpoint rooting and multifurcating +node resolution if necessary). This file will end with a ‘.tre’ extension. + +- .tsv file: Statistics associated with the EvolCCM comparisons, with one line for +each pairwise comparison. + +- .tsv.pvals file: A matrix showing the p-values from all-versus-all comparisons between features. + +- .tsv.X2 file: A matrix showing the X2 values from all-versus-all comparisons between features. + +--- + +Next, `PECCM_BuildFeatureHistogram.R` can be used to generate the feature distribution +histogram. Usage is: + +```bash +Rscript PECCM_BuildFeatureHistogram.R infile +``` + +Where ‘infile’ is the input feature table (for example, Bifido_100_feature_profile.tsv). +There are no other command-line options. A single .jpg file will be produced. + +--- + +`PECCM_BuildStatHistogram.R` is used to generate the statistical summary histograms. Usage is: + +```bash +Rscript PECCM_BuildStatHistogram.R infile +``` + +Where ‘infile’ is the input table of results (‘EvolCCM...tsv’). +Three .jpg files will be produced. + +--- + +`PECCM_Build_GraphML.py` is used to generate a graph from the pairwise comparisons, +with optional p-value thresholding. You can also use the --attribute_name_length +option to truncate attribute names for visual purposes. + +The optional ‘type_underscore’ argument will treat the first part of each attribute name +(up to the first underscore) as its type: for example, ‘plasmid_ABC’ and ‘plasmid_def’ +would both be treated as objects of type ‘plasmid’, with names ‘ABC’ and ‘DEF’, respectively. + +Usage: + +```bash +python ../PECCM_Build_GraphML.py \ + --attribute_name_length 10 \ + --type_underscore EvolCCM_Bifido_100_feature_profile.tsv \ + EvolCCM_Bifido_100.graphml +``` -For more information, check the [full ARETE documentation](https://beiko-lab.github.io/arete/). diff --git a/docs/faq.md b/docs/faq.md index 141d7031..3868321f 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -99,3 +99,17 @@ process { Feel free to change the values above as you wish and then add `-c nextflow.config` to your `nextflow run beiko-lab/ARETE` command. You can point to general process labels, like `process_low`, or you can point directly to process names, like `MOB_RECON`. Learn more at [our usage documentation](https://beiko-lab.github.io/arete/usage/#custom-resource-requests) or [the official nextflow documentation](https://www.nextflow.io/docs/latest/config.html#scope-process). + +## How do I add extra parameters when executing a tool in ARETE? + +Simple! As above, create a file called `nextflow.config` in your working directory and add the following to it: + +```nextflow +withName: EVOLCCM { + ext.args = '--max_abundance 0.8' +} +``` + +Then, add `-c nextflow.config` to your `nextflow run beiko-lab/ARETE` command. +The command above will attach the `--max_abundance 0.8` parameter to the execution of the +`EVOLCCM` tool. diff --git a/modules/local/evolccm.nf b/modules/local/evolccm.nf index a0c3c275..f64cd7f4 100644 --- a/modules/local/evolccm.nf +++ b/modules/local/evolccm.nf @@ -22,6 +22,7 @@ process EVOLCCM { task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: '' """ ParallelEvolCCM.R \\ --intree \\ @@ -29,7 +30,8 @@ process EVOLCCM { --intable \\ $feature_table \\ --cores \\ - $task.cpus + $task.cpus \\ + $args """ stub: diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 47fd0e58..b822bd54 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -4,8 +4,8 @@ process FASTQC { conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads)