From 35b34284993d57645875ec05499f55a5ee30b88e Mon Sep 17 00:00:00 2001
From: skchronicles
Date: Tue, 14 Jan 2025 18:06:44 -0500
Subject: [PATCH] Feat: Adding contig length filter, contig count step, and
 taxonomic viral table.

---
 metavirs                      |  19 ++++-
 workflow/rules/paired-end.smk | 132 ++++++++++++++++++++++++++++++++--
 2 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/metavirs b/metavirs
index e910949..c063666 100755
--- a/metavirs
+++ b/metavirs
@@ -360,7 +360,7 @@ def parsed_arguments(name, description):
         {0}: {1}
         {3}Synopsis:{4}
           Run the pipeline with your raw data.
-            $ {2} run [--help] [--aggregate] \\
+            $ {2} run [--help] [--aggregate] [--length-filter LENGTH_FILTER] \\
                      [--mode ] [--job-name JOB_NAME] \\
                      [--dry-run] [--silent] [--sif-cache SIF_CACHE] \\
                      [--singularity-cache SINGULARITY_CACHE] \\
@@ -393,6 +393,14 @@ def parsed_arguments(name, description):
             default, any resulting reports will be created
             at a per-sample level.
               Example: --aggregate
+          --length-filter LENGTH_FILTER
+            Filter contigs by total length (bp). The contig
+            length filter removes any annotated contigs
+            shorter than this threshold. Annotating small
+            contigs can lead to lower-confidence classifications.
+            This filter is applied to metaspades and megahit
+            contigs, default: 500.
+              Example: --length-filter 500
 
         {3}Orchestration options:{4}
           --mode {{slurm,local}}
@@ -552,6 +560,15 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    # Contig length filter
+    subparser_run.add_argument(
+        '--length-filter',
+        type = int,
+        required = False,
+        default = 500,
+        help = argparse.SUPPRESS
+    )
+
     # Execution Method, run locally
     # on a compute node, submit to
     # SLURM job scheduler, etc.
diff --git a/workflow/rules/paired-end.smk b/workflow/rules/paired-end.smk
index e74b1ff..201a5ae 100644
--- a/workflow/rules/paired-end.smk
+++ b/workflow/rules/paired-end.smk
@@ -470,6 +470,15 @@ rule metaspades:
         k2txt=join(workpath,"{name}","kraken2","{name}.metaspades.contigs.kraken2"),
         krona=join(workpath,"{name}","kraken2","{name}.metaspades.contigs.kraken2.krona"),
         tmp1=join(workpath,"{name}","temp","{name}.metaspades_kraken2.txt"),
+        ktaxids=join(workpath,"{name}","temp","{name}.metaspades_kraken2_taxid.txt"),
+        ktmp=join(workpath,"{name}","temp","{name}.metaspades_kraken2.temp"),
+        knames=join(workpath,"{name}","temp","{name}.metaspades_kraken2_names.txt"),
+        kcounts_temp=join(workpath,"{name}","temp","{name}.metaspades_kraken2_counts.temp"),
+        kcounts_txt=join(workpath,"{name}","temp","{name}.metaspades_kraken2_counts.txt"),
+        kflt=join(workpath,"{name}","temp","{name}.metaspades_kraken2_filtered.txt"),
+        ktaxlineage=join(workpath,"{name}","temp","{name}.metaspades_kraken2_taxlineage.tsv"),
+        kviral_unf=join(workpath,"{name}","temp","{name}.metaspades_kraken2_unfiltered_viraltable.tsv"),
+        kviral_flt=join(workpath,"{name}","temp","{name}.metaspades_kraken2_filtered_viraltable.tsv"),
         cat_class=join(workpath,"{name}","CAT","{name}.metaspades.contig2classification.txt"),
         cat_names=join(workpath,"{name}","CAT","{name}.metaspades.official_names.txt"),
         cat_summary=join(workpath,"{name}","CAT","{name}.metaspades.summary.txt"),
@@ -484,6 +493,8 @@ rule metaspades:
         final=join(workpath,"{name}","output","{name}.metaspades.bam"),
     params:
         rname='metaspades',
+        filter_length=config['options']['length_filter'],
+        taxonkit_db=config['references']['taxonkit_db'],
         viral_db=config['references']['kraken2_viral_db'],
         cat_db=config['references']['CAT_db'],
         cat_tax=config['references']['CAT_taxonomy'],
@@ -513,6 +524,8 @@ rule metaspades:
         if [ -d "{wildcards.name}/metaspades" ]; then
             rm -rf "{wildcards.name}/metaspades"
         fi
+
+        # Assemble reads into contigs with metaspades
         metaspades.py -t {threads} \\
             -m {params.memory} \\
             {params.metaspades_sepe} {input.r1} {params.r2_option} \\
@@ -520,6 +533,7 @@ rule metaspades:
         mkdir -p {wildcards.name}/output
         mv {wildcards.name}/metaspades/contigs.fasta {output.contigs}
 
+        # Annotate the contigs with kraken2
         kraken2 --threads {threads} \\
             --db {params.viral_db} {output.contigs} \\
             --report {output.report} \\
@@ -528,7 +542,6 @@ rule metaspades:
             {output.k2txt} > {output.krona}
         cp {output.krona} {output.tmp1}
-
         # Check the number of assembled contigs
         # prior to running CAT. CAT contigs can
         # fail if this file is empty or only
 
@@ -563,7 +576,6 @@ rule metaspades:
             echo "Please check the input and output of"
             echo "metaspades to troubleshoot the issue!"
             touch {output}
-            exit 0
         }}
 
         # Try to grep for Virsuses, may pipefail
@@ -587,6 +599,8 @@ rule metaspades:
             {output.tmp4} \\
             > {output.cat_contigs}
 
+        # Build an index with assembled contigs
+        # and align reads against them
         bowtie2-build --threads {threads} \\
             {output.contigs} \\
             {wildcards.name}/temp/{wildcards.name}_metaspades
@@ -594,7 +608,6 @@ rule metaspades:
         bowtie2 -p {threads} \\
             -x {wildcards.name}/temp/{wildcards.name}_metaspades \\
             {params.bowtie2_sepe} {input.r1} {params.r2_option} \\
             -S {output.sam}
-
         samtools view -@ {threads} \\
             -bS {output.sam} > {output.bam}
@@ -603,6 +616,53 @@ rule metaspades:
         samtools sort \\
             -@ {threads} \\
             -o {output.final}
         samtools index \\
             -@ {threads} {output.final}
+
+        # Filter results based on contig length,
+        # get counts for the number of reads aligning
+        # to each contig, and create a counts table
+        # with counts and taxonomic classifications.
+        # Get taxids from intermediate results
+        cut -f2 {output.tmp1} \\
+            > {output.ktaxids}
+        # Get contig names and coverage
+        cut -f1,3 {output.tmp1} \\
+            > {output.ktmp}
+        # Get contig names
+        cut -f1 {output.tmp1} \\
+            > {output.knames}
+        # Get counts of each contig in matching order
+        set +o pipefail # grep will fail if -f file is empty
+        samtools idxstats {output.final} \\
+            | grep -f {output.knames} \\
+            | cut -f3 \\
+            > {output.kcounts_temp}
+        set -o pipefail
+        # Create text file with contig name,
+        # coverage, and counts
+        paste \\
+            {output.ktmp} \\
+            {output.kcounts_temp} \\
+            > {output.kcounts_txt}
+        # Create an unfiltered and a contig length
+        # filtered table with taxonomic information
+        awk -F '_' -v OFS='\\t' '$4+0>={params.filter_length} {{print}}' \\
+            {output.kcounts_txt} > {output.kflt}
+        cat {output.ktaxids} \\
+            | taxonkit reformat \\
+            --data-dir {params.taxonkit_db} \\
+            -I 1 \\
+            -r "Unassigned" \\
+            -f "{{k}}\\t{{p}}\\t{{c}}\\t{{o}}\\t{{f}}\\t{{g}}\\t{{s}}\\t{{t}}" \\
+            > {output.ktaxlineage}
+        # Unfiltered viral taxonomic table
+        paste \\
+            {output.kcounts_txt} \\
+            {output.ktaxlineage} \\
+            > {output.kviral_unf}
+        # Filtered viral taxonomic table
+        awk -F '_' -v OFS='\\t' \\
+            '$4+0>={params.filter_length} {{print}}' {output.kviral_unf} \\
+            > {output.kviral_flt}
         """
@@ -623,6 +683,15 @@ rule megahit:
         k2txt=join(workpath,"{name}","kraken2","{name}.megahit.contigs.kraken2"),
         krona=join(workpath,"{name}","kraken2","{name}.megahit.contigs.kraken2.krona"),
         tmp1=join(workpath,"{name}","temp","{name}.megahit_kraken2.txt"),
+        ktaxids=join(workpath,"{name}","temp","{name}.megahit_kraken2_taxid.txt"),
+        ktmp=join(workpath,"{name}","temp","{name}.megahit_kraken2.temp"),
+        knames=join(workpath,"{name}","temp","{name}.megahit_kraken2_names.txt"),
+        kcounts_temp=join(workpath,"{name}","temp","{name}.megahit_kraken2_counts.temp"),
+        kcounts_txt=join(workpath,"{name}","temp","{name}.megahit_kraken2_counts.txt"),
+        kflt=join(workpath,"{name}","temp","{name}.megahit_kraken2_filtered.txt"),
+        ktaxlineage=join(workpath,"{name}","temp","{name}.megahit_kraken2_taxlineage.tsv"),
+        kviral_unf=join(workpath,"{name}","temp","{name}.megahit_kraken2_unfiltered_viraltable.tsv"),
+        kviral_flt=join(workpath,"{name}","temp","{name}.megahit_kraken2_filtered_viraltable.tsv"),
         cat_class=join(workpath,"{name}","CAT","{name}.megahit.contig2classification.txt"),
         cat_names=join(workpath,"{name}","CAT","{name}.megahit.official_names.txt"),
         cat_summary=join(workpath,"{name}","CAT","{name}.megahit.summary.txt"),
@@ -637,6 +706,8 @@ rule megahit:
         final=join(workpath,"{name}","output","{name}.megahit.bam"),
     params:
         rname='megahit',
+        filter_length=config['options']['length_filter'],
+        taxonkit_db=config['references']['taxonkit_db'],
         viral_db=config['references']['kraken2_viral_db'],
         cat_db=config['references']['CAT_db'],
         cat_tax=config['references']['CAT_taxonomy'],
@@ -665,12 +736,15 @@ rule megahit:
         if [ -d "{wildcards.name}/megahit" ]; then
             rm -rf "{wildcards.name}/megahit"
         fi
+
+        # Assemble reads into contigs with megahit
         megahit -t {threads} \\
             {params.megahit_sepe} {input.r1} {params.r2_option} \\
             --out-prefix=megahit \\
             -o {wildcards.name}/megahit
         tr ' ' '_' < {wildcards.name}/megahit/megahit.contigs.fa > {output.contigs}
 
+        # Annotate the contigs with kraken2
         kraken2 --threads {threads} \\
             --db {params.viral_db} {output.contigs} \\
             --report {output.report} \\
@@ -715,7 +789,6 @@ rule megahit:
             echo "Please check the input and output of"
             echo "megahit to troubleshoot the issue!"
             touch {output}
-            exit 0
         }}
 
         # Try to grep for Virsuses, may pipefail
@@ -740,6 +813,8 @@ rule megahit:
             {output.tmp4} \\
             > {output.cat_contigs}
 
+        # Build an index with assembled contigs
+        # and align reads against them
         bowtie2-build --threads {threads} \\
             {output.contigs} \\
             {wildcards.name}/temp/{wildcards.name}_megahit
@@ -747,7 +822,6 @@ rule megahit:
         bowtie2 -p {threads} \\
             -x {wildcards.name}/temp/{wildcards.name}_megahit \\
             {params.bowtie2_sepe} {input.r1} {params.r2_option} \\
             -S {output.sam}
-
         samtools view -@ {threads} \\
             -bS {output.sam} > {output.bam}
@@ -756,6 +830,53 @@ rule megahit:
         samtools sort \\
             -@ {threads} \\
             -o {output.final}
         samtools index \\
             -@ {threads} {output.final}
+
+        # Filter results based on contig length,
+        # get counts for the number of reads aligning
+        # to each contig, and create a counts table
+        # with counts and taxonomic classifications.
+        # Get taxids from intermediate results
+        cut -f2 {output.tmp1} \\
+            > {output.ktaxids}
+        # Get contig names and coverage
+        cut -f1,3 {output.tmp1} \\
+            > {output.ktmp}
+        # Get contig names
+        cut -f1 {output.tmp1} \\
+            > {output.knames}
+        # Get counts of each contig in matching order
+        set +o pipefail # grep will fail if -f file is empty
+        samtools idxstats {output.final} \\
+            | grep -f {output.knames} \\
+            | cut -f3 \\
+            > {output.kcounts_temp}
+        set -o pipefail
+        # Create text file with contig name,
+        # coverage, and counts
+        paste \\
+            {output.ktmp} \\
+            {output.kcounts_temp} \\
+            > {output.kcounts_txt}
+        # Create an unfiltered and a contig length
+        # filtered table with taxonomic information
+        awk -F '\\t' '{{split($1,a,"="); if (a[4] >= {params.filter_length}) print $0}}' \\
+            {output.kcounts_txt} > {output.kflt}
+        cat {output.ktaxids} \\
+            | taxonkit reformat \\
+            --data-dir {params.taxonkit_db} \\
+            -I 1 \\
+            -r "Unassigned" \\
+            -f "{{k}}\\t{{p}}\\t{{c}}\\t{{o}}\\t{{f}}\\t{{g}}\\t{{s}}\\t{{t}}" \\
"{{k}}\\t{{p}}\\t{{c}}\\t{{o}}\\t{{f}}\\t{{g}}\\t{{s}}\\t{{t}}" \\ + > {output.ktaxlineage} + # Unfiltered viral taxonomic table + paste \\ + {output.kcounts_txt} \\ + {output.ktaxlineage} \\ + > {output.kviral_unf} + # Filtered viral taxonomic table + awk -F '\\t' '{{split($1,a,"="); if (a[4] >= {params.filter_length}) print $0}}' \\ + {output.kviral_unf} \\ + > {output.kviral_flt} """ @@ -1189,6 +1310,7 @@ rule prep_aggregate_megahit_scatter: > {output.f2} """ + rule prep_aggregate_metaspades_gather: """ Data-processing step to prep input for the aggregated Krona report.