This repository has been archived by the owner on Sep 15, 2020. It is now read-only.

Commit

Merge pull request #4 from tskir/eva-1937-repeat-expansion-tests
EVA-1937 — Tests for the repeat expansion pipeline
tskir authored Apr 21, 2020
2 parents 64a0b8a + 8043646 commit 83f82ee
Showing 16 changed files with 698 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.pytest_cache
**/__pycache__
*.egg-info
3 changes: 3 additions & 0 deletions .idea/dictionaries/ktsukanov.xml

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions .idea/other.xml

Some generated files are not rendered by default.

15 changes: 8 additions & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
dist: bionic
language: python
python:
- "3.8"

# The pipeline requires GNU parallel; bcftools; and certain Python modules
before_install:
- sudo apt-get -y install parallel
install:
- sudo apt update
- sudo apt -y install samtools bcftools parallel libbz2-dev liblzma-dev
- pip -q install -r requirements.txt
- git clone -q git://github.com/samtools/htslib.git
- git clone -q git://github.com/samtools/bcftools.git
- cd bcftools && make --quiet --jobs `nproc` && cd ..
- export PATH=$PATH:bcftools

# For the actual test, we're running a set of 2,000 ClinVar variants through VEP and comparing the result with the
# expected one (diff will exit with return code 0 if the files are identical, and with 1 otherwise). Of course, this
# means that when VEP updates, the test will break; however, this is exactly the intention, as in this case we will be
# able to compare the results and see if they make sense.
script:
- ls
- echo 'Test 1. VEP mapping pipeline'
- bash run_consequence_mapping.sh vep_mapping_pipeline/test/input.vcf output_mappings.tsv
- diff vep_mapping_pipeline/test/output_mappings.tsv output_mappings.tsv

- echo 'Test 2. Repeat expansion pipeline'
- pip install --editable .
- pytest
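The comment in the Travis config above explains the strategy for Test 1: `diff` exits with code 0 only when the expected and actual output files are identical. A minimal Python sketch of the same file comparison, with made-up file names and contents:

```python
import filecmp
import os
import tempfile

def outputs_match(expected_path, actual_path):
    """Return True when the two files are byte-for-byte identical,
    mirroring what `diff`'s zero exit code asserts in the CI script."""
    return filecmp.cmp(expected_path, actual_path, shallow=False)

# Illustrative files standing in for the expected and freshly produced TSVs.
tmpdir = tempfile.mkdtemp()
expected = os.path.join(tmpdir, 'expected_mappings.tsv')
actual = os.path.join(tmpdir, 'output_mappings.tsv')
for path in (expected, actual):
    with open(path, 'w') as handle:
        handle.write('Name\tMapping\nv1\tSO:0001587\n')

print(outputs_match(expected, actual))  # → True
```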
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -10,11 +10,11 @@ Please see the corresponding module README file for more information.

## Installing requirements

The commands below has been tested for Ubuntu 19.10. You might have to adjust commands and package names if you're using
a different distribution. Note in particular that some older Debian and Ubuntu distrubutions include ancient htslib/
samtools/bcftools versions.
The commands below have been tested on Ubuntu 18.04 and newer. You might have to adjust commands and package names if
you're using a different distribution. Note in particular that some older Debian and Ubuntu distributions include
ancient htslib/samtools/bcftools versions, which will not work.

```bash
sudo apt -y install samtools bcftools parallel libbz2-dev liblzma-dev
sudo pip3 -q install -r requirements.txt
sudo python3 -m pip -q install -r requirements.txt
```
Empty file added __init__.py
Empty file.
10 changes: 9 additions & 1 deletion repeat_expansion_variants/clinvar_identifier_parsing.py
Original file line number Diff line number Diff line change
@@ -4,10 +4,15 @@
module imposes strict validation. Hence, custom regular expressions are necessary.
"""

import logging
import re

from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# Common part for all HGVS-like transcript definitions, e.g. 'NM_001256054.2(C9orf72):'
hgvs_like_transcript_part = (
@@ -74,7 +79,7 @@
r'\('
r'(?P<sequence>[{}]+)'.format(IUPACAmbiguousDNA.letters) +
r'\)n'
r' REPEAT EXPANSION'
r'(?: REPEAT)? EXPANSION'
)
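The change above makes the word `REPEAT` optional in the description pattern. A stand-alone sketch of the resulting behaviour; the letter set is written out here rather than imported, and is assumed to match Biopython's `IUPACAmbiguousDNA.letters`:

```python
import re

# Ambiguous DNA alphabet (assumed equivalent to IUPACAmbiguousDNA.letters).
IUPAC_AMBIGUOUS_DNA = 'GATCRYWSMKHBVDN'

# Both '(CAG)n REPEAT EXPANSION' and '(CAG)n EXPANSION' now match,
# because the ' REPEAT' word is wrapped in an optional non-capturing group.
re_description = re.compile(
    r'\((?P<sequence>[' + IUPAC_AMBIGUOUS_DNA + r']+)\)n(?: REPEAT)? EXPANSION'
)

for name in ('(CAG)n REPEAT EXPANSION', '(GGGGCC)n EXPANSION'):
    match = re_description.search(name)
    print(match.group('sequence'), len(match.group('sequence')))
    # → CAG 3, then GGGGCC 6
```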


@@ -114,4 +119,7 @@ def parse_variant_identifier(variant_name):
match = re_description.search(variant_name)
if match:
repeat_unit_length = len(match.group('sequence'))
return transcript_id, coordinate_span, repeat_unit_length, is_protein_hgvs

logger.warning('ClinVar identifier did not match any of the regular expressions: {}'.format(variant_name))
return transcript_id, coordinate_span, repeat_unit_length, is_protein_hgvs
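The hunk above adds a fall-through: when no regular expression matches the identifier, a warning is logged instead of failing silently, and the default values are returned. A hypothetical miniature of that pattern, using a single simplified regex rather than the pipeline's real ones:

```python
import logging
import re

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Simplified stand-in for the pipeline's set of identifier patterns.
RE_SIMPLE_REPEAT = re.compile(r'\((?P<sequence>[GATC]+)\)n')

def repeat_unit_length(variant_name):
    """Return the repeat unit length, or None (with a logged warning)
    when the identifier matches no known pattern."""
    match = RE_SIMPLE_REPEAT.search(variant_name)
    if match:
        return len(match.group('sequence'))
    logger.warning(
        'ClinVar identifier did not match any of the regular expressions: %s',
        variant_name)
    return None

print(repeat_unit_length('(CAG)n'))      # → 3
print(repeat_unit_length('p.Ala53Thr'))  # → None, after logging a warning
```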
3 changes: 2 additions & 1 deletion repeat_expansion_variants/pipeline.py
Original file line number Diff line number Diff line change
@@ -149,12 +149,13 @@ def generate_output_files(variants, output_consequences, output_dataframe):

# Rearrange order of dataframe columns
variants = variants[
['Name', 'RCVaccession', 'HGNC_ID', 'GeneSymbol',
['Name', 'RCVaccession', 'GeneSymbol', 'HGNC_ID',
'RepeatUnitLength', 'CoordinateSpan', 'IsProteinHGVS', 'TranscriptID',
'EnsemblGeneID', 'EnsemblGeneName', 'GeneAnnotationSource',
'RepeatType', 'RecordIsComplete']
]
# Write the full dataframe. This is used for debugging and investigation purposes.
variants = variants.sort_values(by=['Name', 'RCVaccession', 'GeneSymbol'])
variants.to_csv(output_dataframe, sep='\t', index=False)

# Generate consequences table
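The column reordering and sorting above uses pandas. A sketch with illustrative rows; note that `DataFrame.sort_values` returns a new dataframe, so the result must be assigned back (or sorted with `inplace=True`) for the sort to take effect:

```python
import pandas as pd

# Illustrative rows only; real pipeline rows carry many more columns.
variants = pd.DataFrame({
    'Name': ['v2', 'v1'],
    'RCVaccession': ['RCV000002', 'RCV000001'],
    'GeneSymbol': ['HTT', 'ATXN1'],
    'HGNC_ID': ['HGNC:4851', 'HGNC:10548'],
})

# Rearrange column order by selecting columns in the desired sequence.
variants = variants[['Name', 'RCVaccession', 'GeneSymbol', 'HGNC_ID']]

# sort_values does not modify in place: assign the returned dataframe.
variants = variants.sort_values(by=['Name', 'RCVaccession', 'GeneSymbol'])

print(variants['Name'].tolist())  # → ['v1', 'v2']
```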
15 changes: 15 additions & 0 deletions repeat_expansion_variants/test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Tests for the repeat expansion pipeline

ClinVar repeat expansion data includes a number of peculiarities. Checking them all in separate unit tests would be
expensive to develop and maintain. Hence, the pipeline uses a hybrid integration test with an annotated dataset.

The dataset includes the input file [`input_variant_summary.tsv`](input_variant_summary.tsv) and two expected output
files: [`output_dataframe.tsv`](output_dataframe.tsv) and [`output_consequences.tsv`](output_consequences.tsv). The
input file is not a sample, but rather a complete selection of “NT expansion” variants from ClinVar data as of
2020-04-08. The expected output files were produced by the pipeline and checked manually for correctness. The idea
behind including the entire dataset is that it will make the tests sensitive to even minor changes.

The test files are annotated using comments, which the testing function removes before using the files. The records of
special interest are listed at the top, and their peculiarities are documented. This makes it possible to trace the fate
of each such record from the input, through the full dataframe, to the collapsed final output.

In addition to the hybrid integration test, the code of the pipeline itself performs sanity checks whenever possible.
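The README above says the annotation comments are removed before the test files are used. A minimal sketch of such a filter, assuming `#`-prefixed comment lines (the actual comment syntax and record contents are illustrative assumptions):

```python
def strip_comments(lines, comment_char='#'):
    """Drop annotation comment lines so the remaining records can be fed
    to the pipeline or compared against expected output."""
    return [line for line in lines if not line.lstrip().startswith(comment_char)]

annotated = [
    '# Record of special interest: unusual coordinate span',
    'variant_record_1\tRCV000001',
    'variant_record_2\tRCV000002',
]
print(strip_comments(annotated))
```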
Empty file.
