From 45aafb4e8d9cc1977d5dd51b07d1c402b90ca7ef Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 28 Jul 2021 11:28:49 +0100 Subject: [PATCH] Truncate genotype calls that exceed max alt alleles --- sgkit/io/vcf/vcf_reader.py | 14 +++++++++++--- sgkit/tests/io/vcf/test_vcf_reader.py | 8 +++++++- sgkit/tests/io/vcf/test_vcf_roundtrip.py | 18 +++++++++++++----- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index 18ab43763..725032376 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -161,7 +161,7 @@ def for_field( ) -> "VcfFieldHandler": if field == "FORMAT/GT": return GenotypeFieldHandler( - vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls + vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles ) category = field.split("/")[0] vcf_field_defs = _get_vcf_field_defs(vcf, category) @@ -286,6 +286,7 @@ def __init__( ploidy: int, mixed_ploidy: bool, truncate_calls: bool, + max_alt_alleles: int, ) -> None: n_sample = len(vcf.samples) self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1") @@ -293,6 +294,7 @@ def __init__( self.ploidy = ploidy self.mixed_ploidy = mixed_ploidy self.truncate_calls = truncate_calls + self.max_alt_alleles = max_alt_alleles def add_variant(self, i: int, variant: Any) -> None: fill = -2 if self.mixed_ploidy else -1 @@ -305,6 +307,10 @@ def add_variant(self, i: int, variant: Any) -> None: self.call_genotype[i, ..., 0:n] = gt[..., 0:n] self.call_genotype[i, ..., n:] = fill self.call_genotype_phased[i] = gt[..., -1] + + # set any calls that exceed maximum number of alt alleles as missing + self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1 + else: self.call_genotype[i] = fill self.call_genotype_phased[i] = 0 @@ -616,7 +622,8 @@ def vcf_to_zarrs( max_alt_alleles The (maximum) number of alternate alleles in the VCF file. Any records with more than this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Call genotype fields will however be unaffected. + variable will be truncated). Any call genotype fields with the extra alleles will + be changed to the missing-allele sentinel value of -1. fields Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. @@ -784,7 +791,8 @@ def vcf_to_zarr( max_alt_alleles The (maximum) number of alternate alleles in the VCF file. Any records with more than this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Call genotype fields will however be unaffected. + variable will be truncated). Any call genotype fields with the extra alleles will + be changed to the missing-allele sentinel value of -1. fields Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py index 9acb319db..2ed0ea55b 100644 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ b/sgkit/tests/io/vcf/test_vcf_reader.py @@ -103,7 +103,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): output = tmp_path.joinpath("vcf.zarr").as_posix() with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1) + max_alt_alleles = 1 + vcf_to_zarr( + path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles + ) ds = xr.open_zarr(output) # extra alt alleles are dropped @@ -122,6 +125,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): ], ) + # genotype calls are truncated + assert np.all(ds["call_genotype"].values <= max_alt_alleles) + # the maximum number of alt alleles actually seen is stored as an attribute assert ds.attrs["max_alt_alleles_seen"] == 3 diff --git a/sgkit/tests/io/vcf/test_vcf_roundtrip.py b/sgkit/tests/io/vcf/test_vcf_roundtrip.py index f87f11b41..8ed2784c0 100644 --- a/sgkit/tests/io/vcf/test_vcf_roundtrip.py +++ b/sgkit/tests/io/vcf/test_vcf_roundtrip.py @@ -114,17 +114,23 @@ def test_DP_field(shared_datadir, tmpdir): @pytest.mark.parametrize( - "vcf_file,allel_exclude_fields,sgkit_exclude_fields", + "vcf_file,allel_exclude_fields,sgkit_exclude_fields,max_alt_alleles", [ - ("sample.vcf.gz", None, None), - ("mixed.vcf.gz", None, None), + ("sample.vcf.gz", None, None, 3), + ("mixed.vcf.gz", None, None, 3), # exclude PL since it has Number=G, which is not yet supported - ("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"]), + # increase max_alt_alleles since scikit-allel does not truncate genotype calls + ("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"], 7), ], ) @pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") def test_all_fields( - shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields + shared_datadir, + tmpdir, + vcf_file, + allel_exclude_fields, + sgkit_exclude_fields, + max_alt_alleles, ): # change scikit-allel type defaults back to the VCF default types = { @@ -140,6 +146,7 @@ def test_all_fields( fields=["*"], exclude_fields=allel_exclude_fields, types=types, + alt_number=max_alt_alleles, ) field_defs = { @@ -159,6 +166,7 @@ def test_all_fields( exclude_fields=sgkit_exclude_fields, field_defs=field_defs, truncate_calls=True, + max_alt_alleles=max_alt_alleles, ) sg_ds = sg.load_dataset(str(sg_vcfzarr_path)) sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel