diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index 4a1d6d2..59af196 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7] + python-version: [3.8] steps: - uses: actions/checkout@v2 @@ -29,6 +29,8 @@ jobs: echo "/tmp/nextflow" >> $GITHUB_PATH cd - # $CONDA is an environment variable pointing to the root of the miniconda directory + $CONDA/bin/conda update conda + $CONDA/bin/conda install -y python=${{ matrix.python-version }} $CONDA/bin/conda env update -q --file conda.yml --name base $CONDA/bin/conda run pip install -q -r requirements.txt diff --git a/README.md b/README.md index 8d690a4..74c3a3d 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,11 @@ Other files are created alongside the main output: - `_nra_variants.vcf` variants successfully remap that landed in a position where the reference allele changed. The output contains the original variant and the original reference allele as alternate. - `_unmapped.vcf` original variant that could not be successfully remap - `_count.yml` YAML file containing counts associated with each round of remapping + +## Configuration + +The pipeline relies on Nextflow configuration to set memory and runtime requirements. This is not required for all users, but it is recommended particularly for HPC and cloud environments. + +There is an [example config](tests/resources/nextflow.config) used for tests that you can modify for your own needs. The main features are the use of labels to group processes into different categories based on their resource needs (small/medium/large), and the use of `base_memory` and `base_time` variables that some processes use to fine-tune their requirements. + +For more about Nextflow configuration, see the [documentation](https://www.nextflow.io/docs/latest/config.html). diff --git a/conda.yml b/conda.yml index 0939745..52efd73 100755 --- a/conda.yml +++ b/conda.yml @@ -1,8 +1,8 @@ name: variant-remapping channels: - - defaults - conda-forge - bioconda + - defaults dependencies: - bedtools - minimap2 diff --git a/main.nf b/main.nf index 7ee6f8e..a7f84ee 100755 --- a/main.nf +++ b/main.nf @@ -1,6 +1,5 @@ #!/usr/bin/env nextflow - // Enable syntax extension // See https://www.nextflow.io/docs/latest/dsl2.html nextflow.enable.dsl=2 @@ -46,6 +45,7 @@ outfile_dir = file(params.outfile).getParent() * Uncompress VCF file */ process uncompressInputVCF { + label 'short_time', 'med_mem' input: path "source.vcf" @@ -69,6 +69,7 @@ process uncompressInputVCF { * filter VCF file to remove variant too close the edges of chromosome because we can't get flanking regions */ process filterInputVCF { + label 'default_time', 'med_mem' input: path "source.vcf" @@ -94,6 +95,7 @@ process filterInputVCF { * Store the original VCF header for later use */ process storeVCFHeader { + label 'short_time', 'small_mem' input: path "source.vcf" @@ -114,6 +116,7 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long * This process convert the original Header to the remapped header and concatenate it with the remapped VCF records */ process generateRemappedVCF { + label 'short_time', 'small_mem' input: path "vcf_header.txt" @@ -148,6 +151,7 @@ process generateRemappedVCF { * This process adds the original header to unmapped variant VCF records and output the results */ process generateUnmappedVCF { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -170,6 +174,7 @@ process generateUnmappedVCF { * Sort VCF file */ process sortVCF { + label 'default_time', 'med_mem' input: path "variants_remapped.vcf" @@ -187,6 +192,7 @@ process sortVCF { * Run bcftools norm to swap the REF and ALT alleles if the REF doesn't match the new assembly */ process normalise { + label 'default_time', 'med_mem' input: path "variants_remapped_sorted.vcf.gz" @@ -202,6 +208,7 @@ process normalise { process collectNovelReferenceAlleles { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -224,6 +231,7 @@ process collectNovelReferenceAlleles { * Create file containing remapping stats */ process outputStats { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -244,6 +252,8 @@ process outputStats { * Concatenate the unmapped variants */ process combineUnmappedVCF { + label 'short_time', 'small_mem' + input: path "variants1.vcf" path "variants2.vcf" @@ -258,6 +268,8 @@ process combineUnmappedVCF { process combineVCF { + label 'short_time', 'small_mem' + input: path "variants1.vcf" path "variants2.vcf" @@ -271,6 +283,8 @@ process combineVCF { } process combineYaml { + label 'short_time', 'small_mem' + input: path "initial_yml" path "round1.yml" diff --git a/prepare_genome.nf b/prepare_genome.nf index 0c60077..44cf2df 100755 --- a/prepare_genome.nf +++ b/prepare_genome.nf @@ -9,8 +9,11 @@ nextflow.enable.dsl=2 * Index the new reference genome using bowtie_build */ process bowtieGenomeIndex { + label 'med_time' + // Memory required is 10 times the size of the fasta in Bytes or at least 1GB - memory Math.max(file(params.newgenome).size() * 10, 1073741824) + ' B' + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 10, 1073741824) } input: path "genome_fasta" @@ -25,6 +28,7 @@ process bowtieGenomeIndex { process samtoolsFaidx { + label 'med_time', 'med_mem' input: path "genome_basename" @@ -41,6 +45,7 @@ process samtoolsFaidx { * Extract chomosome/contig sizes */ process chromSizes { + label 'short_time', 'small_mem' input: path "genome.fa.fai" diff --git a/tests/resources/config.yml b/tests/resources/config.yml deleted file mode 100644 index 4162a2c..0000000 --- a/tests/resources/config.yml +++ /dev/null @@ -1,5 +0,0 @@ -executor { - $local { - memory = '6 GB' - } -} \ No newline at end of file diff --git a/tests/resources/nextflow.config b/tests/resources/nextflow.config new file mode 100644 index 0000000..c0305ea --- /dev/null +++ b/tests/resources/nextflow.config @@ -0,0 +1,46 @@ + +executor { + name = 'local' +} + +process.ext.base_memory = 6.GB +process.ext.base_time = 10.minutes + +process { + executor = 'local' + + // Dynamic resource allocation with retries + errorStrategy = 'retry' + maxRetries = 1 + memory = { task.ext.base_memory * task.attempt } + time = { task.ext.base_time * task.attempt } + + // Labels for specific runtimes + withLabel: short_time { + ext.base_time = 5.minutes + } + withLabel: default_time { + ext.base_time = 10.minutes + } + withLabel: med_time { + ext.base_time = 30.minutes + } + withLabel: long_time { + ext.base_time = 1.hour + } + + // Labels for specific memory usage + withLabel: small_mem { + ext.base_memory = 1.GB + } + withLabel: default_mem { + ext.base_memory = 6.GB + } + withLabel: med_mem { + ext.base_memory = 8.GB + } + withLabel: big_mem { + ext.base_memory = 10.GB + } + +} diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 8180be7..c82e85d 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -33,7 +33,7 @@ chr1 3710 . T A 50 PASS . GT:GQ 1/1:0 EOT nextflow run ${SOURCE_DIR}/main.nf \ --config ${SCRIPT_DIR}/resources/config.yml \ +-config ${SCRIPT_DIR}/resources/nextflow.config \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source.vcf \ diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh index a40b4f4..c853cdc 100755 --- a/tests/test_pipeline_empty.sh +++ b/tests/test_pipeline_empty.sh @@ -20,11 +20,11 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf" ##INFO= ##FORMAT= ##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 EOT nextflow run ${SOURCE_DIR}/main.nf \ --config ${SCRIPT_DIR}/resources/config.yml \ +-config ${SCRIPT_DIR}/resources/nextflow.config \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \ @@ -52,6 +52,7 @@ rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source_empty.vcf \ ${SCRIPT_DIR}/resources/expected_remap.vcf \ ${SCRIPT_DIR}/resources/remap_empty.vcf \ + ${SCRIPT_DIR}/resources/remap_empty_nra_variants.vcf \ ${SCRIPT_DIR}/resources/remap_empty_counts.yml \ ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \ ${SCRIPT_DIR}/resources/new_genome.fa.* \ diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 5083809..6b49fdb 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -1,6 +1,5 @@ #!/usr/bin/env nextflow - // Enable syntax extension // See https://www.nextflow.io/docs/latest/dsl2.html nextflow.enable.dsl=2 @@ -11,6 +10,7 @@ nextflow.enable.dsl=2 * "strand" column. */ process convertVCFToBed { + label 'default_time', 'med_mem' input: path "source.vcf" @@ -38,6 +38,7 @@ process convertVCFToBed { * Based on variants BED, generate the BED file for each flank. */ process flankingRegionBed { + label 'default_time', 'med_mem' input: path "variants.bed" @@ -67,8 +68,7 @@ process flankingRegionBed { * Extract the actual flanking region in fasta format. */ process flankingRegionFasta { - - memory '4 GB' + label 'default_time', 'med_mem' input: path "flanking_r1.bed" @@ -91,8 +91,7 @@ process flankingRegionFasta { * Extract information about the original variants and put it in the fasta header */ process extractVariantInfoToFastaHeader { - - memory '6GB' + label 'default_time', 'med_mem' input: path "flanking_r1.bed" @@ -127,6 +126,7 @@ process extractVariantInfoToFastaHeader { * Split fasta entries into multiple chunks */ process split_fasta { + label 'short_time', 'small_mem' input: path interleaved_fasta @@ -150,13 +150,11 @@ process split_fasta { * Align sequence with minimap2 */ process alignWithMinimap { + label 'med_time' - // Memory required is 5 times the size of the fasta in Bytes or at least 1GB - // Retry on kill (exit status 130) with twice the amount of memory - memory { Math.max(file(params.newgenome).size() * 10, 2000000000) * task.attempt + ' B' } - - errorStrategy { task.exitStatus == 130 ? 'retry' : 'terminate' } - maxRetries 3 + // Memory required is 10 times the size of the fasta in Bytes or at least 2GB + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) } input: // reads contains paired interleaved (first and second read in the same file) @@ -168,7 +166,6 @@ process alignWithMinimap { output: path "reads_aligned.bam", emit: reads_aligned_bam - script: if (flanklength < 500) """ @@ -199,6 +196,7 @@ process alignWithMinimap { * Sort BAM file by name */ process sortByName { + label 'default_time', 'med_mem' input: path "reads_aligned.bam" @@ -215,9 +213,11 @@ process sortByName { * Align sequence with bowtie2 */ process alignWithBowtie { + label 'med_time' // Memory required is 5 times the size of the fasta in Bytes or at least 1GB - memory Math.max(file(params.newgenome).size() * 5, 1073741824) + ' B' + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) } input: path "variant_read1.fa" @@ -242,6 +242,7 @@ process alignWithBowtie { * Take the reads and process them to get the remapped variants */ process readsToRemappedVariants { + label 'default_time', 'med_mem' input: path "reads.bam" @@ -276,6 +277,8 @@ process readsToRemappedVariants { * */ process merge_variants { + label 'short_time', 'small_mem' + input: path "remapped*.vcf" path "unmapped*.vcf"