Add VEP annotation tool

moiexpositoalonsolab · May 24, 2021 · 005b7e6 · 005b7e6
1 parent 49b37b3
commit 005b7e6
Show file tree

Hide file tree

Showing 10 changed files with 261 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -43,6 +43,7 @@ Pipeline Overview
     - [QualiMap](http://qualimap.conesalab.org/)
     - [Picard CollectMultipleMetrics](https://gatk.broadinstitute.org/hc/en-us/articles/360042478112-CollectMultipleMetrics-Picard-)
     - [snpEff](https://pcingola.github.io/SnpEff/)
+    - [VEP](https://uswest.ensembl.org/info/docs/tools/vep/index.html) (Ensembl Variant Effect Predictor)
     - [MultiQC](https://multiqc.info/)
 
 **Output:**

diff --git a/Snakefile b/Snakefile
@@ -18,7 +18,8 @@ rule all:
         # Basic steps
         "genotyped/all.vcf.gz",
         "filtered/all.vcf.gz",
-        "annotated/all.vcf.gz" if config["settings"]["snpeff"] else [],
+        "annotated/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
+        "annotated/vep.vcf.gz" if config["settings"]["vep"] else [],
 
         # Quality control
         "qc/multiqc.html",

diff --git a/config.yaml b/config.yaml
@@ -76,6 +76,13 @@ settings:
   # as otherwise the whole pipeline will be run regardless.
   snpeff: true
 
+  # Set to true in order to annotate the variant calls with VEP.
+  # If used, make sure to set the correct species entry for VEP below in the params,
+  # and all other desired settings and plugins for VEP.
+  # Set to false when using the workflow to only obtain quality control statistics,
+  # as otherwise the whole pipeline will be run regardless.
+  vep: true
+
   # Set to true in order to run mapDamage.
   mapdamage: false
 
@@ -402,6 +409,58 @@ params:
     # Additional parameters for snpeff, see https://pcingola.github.io/SnpEff/se_commandline/
     extra: "-Xmx4g"
 
+  # ----------------------------------------------------------------------
+  #     VEP
+  # ----------------------------------------------------------------------
+
+  # Used only if settings:vep == true
+  vep:
+
+    # The VEP documentation is not really helpful when trying to find the cache sources
+    # for anything other than the Homo sapiens data. The list of Ensembl genomes can be found
+    # here: https://uswest.ensembl.org/info/docs/tools/vep/script/vep_cache.html,
+    # but it can be a bit tricky to find the exact names and the FTP download URL that need to be
+    # used here (see params below for details).
+    # Note that there seem to be two version numbers that are for different parts of their
+    # ecosystem, e.g., Ensembl release 104 / Ensembl Genomes 51, which is confusing.
+
+    # Ensembl species name
+    species: "arabidopsis_thaliana"
+
+    # Genome build
+    build: "TAIR10"
+
+    # Ensembl release version.
+    # Used for the species cache version and for the plugins download.
+    release: 104
+
+    # For non-metazoans, we need different download URLs for obtaining the Ensembl data,
+    # but it does not seem to be documented well where to find those URLs.
+    # Try http://uswest.ensembl.org/info/docs/tools/vep/script/vep_download.html#installer,
+    # and http://uswest.ensembl.org/info/docs/tools/vep/script/vep_cache.html#cache
+    # which list FTP directories for the Ensembl Genomes.
+    # Alternatively, try to find your desired species here: http://ftp.ebi.ac.uk/ensemblgenomes/pub/
+    # Then, use the FTP url here (instead of tht http URL) so that we can find and download the
+    # above species data.
+    # If left empty, we use the default URL as used by the VEP install script.
+    cache-url: "ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants/current/variation/vep"
+
+    # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
+    # Plugin arguments can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt",
+    # or via an entry "MyPlugin,1,FOO", see the documentation linked above.
+    plugins:
+      - LoFtool
+
+    # Set the directories to download the VEP species/cache/database and plugins to.
+    # In many cases, this helps to avoid re-downloaded the data for every run of grenepipe.
+    # Hence we recommend to use absolute paths!
+    # If left blank, we use the same directory where the reference genome is located.
+    cache-dir: ""
+    plugins-dir: ""
+
+    # Extra command line arguments (e.g. --sift, see docs) for the VEP annotation.
+    extra: ""
+
   # ----------------------------------------------------------------------
   #     mapdamage
   # ----------------------------------------------------------------------

diff --git a/envs/vep.yaml b/envs/vep.yaml
@@ -0,0 +1,11 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  # This version should not be changed unless the vep_install command needs an
+  # update or suddenly stops working.
+  - ensembl-vep =104
+  - bcftools =1.10
+  - python
+  - numpy
+  - pandas
diff --git a/reports/stats.rst b/reports/stats.rst
@@ -0,0 +1 @@
+Statistics on variant calls, as calculated by VEP.
diff --git a/rules/annotation.smk b/rules/annotation.smk
@@ -43,13 +43,20 @@ rule snpeff:
         db=get_snpeff_db_path() + config["params"]["snpeff"]["name"]
     output:
         # annotated calls (vcf, bcf, or vcf.gz)
-        calls=report("annotated/all.vcf.gz", caption="../reports/vcf.rst", category="Calls"),
+        calls=report(
+            "annotated/snpeff.vcf.gz",
+            caption="../reports/vcf.rst",
+            category="Calls"
+        ),
 
         # summary statistics (in HTML), optional
-        stats=report("snpeff/all.html", category="Calls"),
+        stats=report(
+            "annotated/snpeff.html",
+            category="Calls"
+        ),
 
         # summary statistics in CSV, optional
-        csvstats="snpeff/all.csv"
+        csvstats="annotated/snpeff.csv"
     log:
         "logs/snpeff.log"
     group:
@@ -60,3 +67,117 @@ rule snpeff:
         extra=config["params"]["snpeff"]["extra"]
     wrapper:
         "0.74.0/bio/snpeff/annotate"
+
+# =================================================================================================
+#     VEP Downloads
+# =================================================================================================
+
+# Get the paths where we store the data, so that we do not have to download it all the time.
+# If not provided by the user, we use the directory where the reference genome is.
+def get_vep_cache_dir():
+    if config["params"]["vep"]["cache-dir"]:
+        result = os.path.dirname( config["params"]["vep"]["cache-dir"] )
+    else:
+        result = os.path.join(
+            os.path.dirname( config["data"]["reference"]["genome"] ), "vep-cache"
+        )
+    # VEP is super stupid and cannot create directories.
+    # So let's do the job here.
+    # if not os.path.exists(result):
+    #     os.makedirs(result)
+    return result
+
+# Same for plugins dir.
+def get_vep_plugins_dir():
+    if config["params"]["vep"]["plugins-dir"]:
+        result = os.path.dirname( config["params"]["vep"]["plugins-dir"] )
+    else:
+        result = os.path.join(
+            os.path.dirname( config["data"]["reference"]["genome"] ), "vep-plugins"
+        )
+    # if not os.path.exists(result):
+    #     os.makedirs(result)
+    return result
+
+rule vep_cache:
+    output:
+        directory(get_vep_cache_dir()),
+    params:
+        species  = config["params"]["vep"]["species"],
+        release  = config["params"]["vep"]["release"],
+        build    = config["params"]["vep"]["build"],
+        cacheurl = config["params"]["vep"]["cache-url"],
+        # fastaurl = config["params"]["vep"]["fasta-url"],
+        # fasta-url: "ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants/current/fasta"
+    log:
+        "logs/vep-cache.log",
+    conda:
+        # We use a conda environment on top of the wrapper, as the wrapper always causes
+        # issues with missing python modules and mismatching program versions and stuff...
+        # Here, numpy is missing.
+        # Well no, we don't even use the wrapper any more, but our own improved version...
+        "../envs/vep.yaml"
+    script:
+        "../scripts/vep-cache.py"
+    # wrapper:
+    #     "0.74.0/bio/vep/cache"
+
+rule vep_plugins:
+    output:
+        directory(get_vep_plugins_dir()),
+    params:
+        release = config["params"]["vep"]["release"],
+    log:
+        "logs/vep-plugins.log",
+    conda:
+        # Use our own env definition here, to ensure that we are working with the same vep
+        # versions across the different rules here. This is not the case in the original wrapper...
+        "../envs/vep.yaml"
+    wrapper:
+        "0.74.0/bio/vep/plugins"
+
+# Rules are not submitted as a job to the cluster.
+localrules: vep_cache, vep_plugins
+
+# =================================================================================================
+#     VEP
+# =================================================================================================
+
+rule vep:
+    input:
+        calls="filtered/all.vcf.gz",
+        cache=get_vep_cache_dir(),
+        plugins=get_vep_plugins_dir(),
+    output:
+        calls=report(
+            "annotated/vep.vcf.gz",
+            caption="../report/vcf.rst",
+            category="Calls",
+        ),
+        stats=report(
+            # The html file sas to have a specific file name, so that MultiQC can find it,
+            # see https://multiqc.info/docs/#vep
+            "annotated/vep_summary.html",
+            caption="../report/stats.rst",
+            category="Calls",
+        ),
+    params:
+        # Pass a list of plugins to use,
+        # see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
+        # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
+        plugins=config["params"]["vep"]["plugins"],
+        extra=config["params"]["vep"]["extra"],
+    log:
+        "logs/vep-annotate.log",
+    threads: 4
+    conda:
+        # Use our own env definition here, to ensure that we are working with the same vep
+        # versions across the different rules here. This is not the case in the original wrapper...
+        "../envs/vep.yaml"
+    # script:
+    #     "../scripts/vep.py"
+    # The original snakemake wrapper fails for the Arabidopsis thaliana genome because of an
+    # unnecessary/wrong error check. It just wants there to be one file in the database,
+    # when VEP apparently also can produce multiple files/directories in the cache.
+    wrapper:
+        "0.74.0/bio/vep/annotate"
diff --git a/rules/qc.smk b/rules/qc.smk
@@ -239,7 +239,8 @@ rule multiqc:
         ),
 
         # Annotation
-        "snpeff/all.csv" if config["settings"]["snpeff"] else [],
+        "annotated/snpeff.csv" if config["settings"]["snpeff"] else [],
+        "annotated/vep_summary.html" if config["settings"]["vep"] else [],
 
         # Damage
         expand(

diff --git a/rules/stats.smk b/rules/stats.smk
@@ -4,7 +4,7 @@
 
 rule vcf_to_tsv:
     input:
-        "annotated/all.vcf.gz"
+        "annotated/snpeff.vcf.gz"
     output:
         report("tables/calls.tsv.gz", caption="../reports/calls.rst", category="Calls")
     log:

diff --git a/schemas/config.schema.yaml b/schemas/config.schema.yaml
@@ -53,6 +53,8 @@ properties:
         type: integer
       snpeff:
         type: boolean
+      vep:
+        type: boolean
       mapdamage:
         type: boolean
       damageprofiler:
@@ -69,6 +71,7 @@ properties:
       - recalibrate-base-qualities
       - calling-tool
       - snpeff
+      - vep
       - mapdamage
       - damageprofiler
       - frequency-table
@@ -264,6 +267,13 @@ properties:
             type: string
           extra:
             type: string
+      snpeff:
+        type: object
+        properties:
+          species:
+            type: string
+          extra:
+            type: string
       mapdamage:
         type: object
         properties:
@@ -298,6 +308,7 @@ properties:
       - fastqc
       - qualimap
       - snpeff
+      - vep
       - mapdamage
       - damageprofiler
       - multiqc

diff --git a/scripts/vep-cache.py b/scripts/vep-cache.py
@@ -0,0 +1,49 @@
+# =================================================================================================
+#     README
+# =================================================================================================
+
+# Snakemake wrapper for vep cache, adapted from the wrapper script at
+# https://github.com/snakemake/snakemake-wrappers/blob/master/bio/vep/cache/wrapper.py in version 0.74.0,
+# https://github.com/snakemake/snakemake-wrappers/blob/6b71f64fba7ee2c6cad31315d9ccb1ed26c4605c/bio/vep/cache/wrapper.py
+#
+# We here fix some issues with the original script before the wrapper is fixed.
+# In particular, we make the requirement that fasta is downloaded optional,
+# and add the capability to set download URLs for the cache and fasta files.
+
+# =================================================================================================
+#     VEP Cache
+# =================================================================================================
+
+from pathlib import Path
+from snakemake.shell import shell
+
+# Get params. By default, we run only cache (--AUTO c), unlike the original wrapper,
+# which also requestd fasta (--AUTO cf), which would then mess up the check that the
+# subdirectory of the cache contains a single directory that is done in the vep annotation wrapper.
+# See https://github.com/snakemake/snakemake-wrappers/issues/365
+automode = snakemake.params.get("automode", "c")
+extra = snakemake.params.get("extra", "")
+
+# Extra optional cache and fasta url
+cacheurl = snakemake.params.get("cacheurl", "")
+if cacheurl:
+    cacheurl = "--CACHEURL \"{}\"".format(cacheurl)
+fastaurl = snakemake.params.get("fastaurl", "")
+if fastaurl:
+    fastaurl = "--FASTAURL \"{}\"".format(fastaurl)
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+# Compared to the original wrapper, we add the two urls, and also use a newer version
+# of vep install, which uses --CACHE_VERSION instead of --VERSION.
+shell(
+    "vep_install --AUTO {automode} "
+    "--SPECIES {snakemake.params.species} "
+    "--ASSEMBLY {snakemake.params.build} "
+    "--CACHE_VERSION {snakemake.params.release} "
+    "--CACHEDIR {snakemake.output} "
+    "--CONVERT "
+    "--NO_UPDATE "
+    "{cacheurl} {fastaurl} "
+    "{extra} {log}"
+)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Statistics on variant calls, as calculated by VEP.