Skip to content

Commit

Permalink
Truncate genotype calls that exceed max alt alleles
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jul 29, 2021
1 parent 71e8afa commit 45aafb4
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
14 changes: 11 additions & 3 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def for_field(
) -> "VcfFieldHandler":
if field == "FORMAT/GT":
return GenotypeFieldHandler(
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles
)
category = field.split("/")[0]
vcf_field_defs = _get_vcf_field_defs(vcf, category)
Expand Down Expand Up @@ -286,13 +286,15 @@ def __init__(
ploidy: int,
mixed_ploidy: bool,
truncate_calls: bool,
max_alt_alleles: int,
) -> None:
n_sample = len(vcf.samples)
self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1")
self.call_genotype_phased = np.empty((chunk_length, n_sample), dtype=bool)
self.ploidy = ploidy
self.mixed_ploidy = mixed_ploidy
self.truncate_calls = truncate_calls
self.max_alt_alleles = max_alt_alleles

def add_variant(self, i: int, variant: Any) -> None:
fill = -2 if self.mixed_ploidy else -1
Expand All @@ -305,6 +307,10 @@ def add_variant(self, i: int, variant: Any) -> None:
self.call_genotype[i, ..., 0:n] = gt[..., 0:n]
self.call_genotype[i, ..., n:] = fill
self.call_genotype_phased[i] = gt[..., -1]

# set any calls that exceed maximum number of alt alleles as missing
self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1

else:
self.call_genotype[i] = fill
self.call_genotype_phased[i] = 0
Expand Down Expand Up @@ -616,7 +622,8 @@ def vcf_to_zarrs(
max_alt_alleles
The (maximum) number of alternate alleles in the VCF file. Any records with more than
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
variable will be truncated). Call genotype fields will however be unaffected.
variable will be truncated). Any call genotype fields with the extra alleles will
be changed to the missing-allele sentinel value of -1.
fields
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
Expand Down Expand Up @@ -784,7 +791,8 @@ def vcf_to_zarr(
max_alt_alleles
The (maximum) number of alternate alleles in the VCF file. Any records with more than
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
variable will be truncated). Call genotype fields will however be unaffected.
variable will be truncated). Any call genotype fields with the extra alleles will
be changed to the missing-allele sentinel value of -1.
fields
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
Expand Down
8 changes: 7 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
output = tmp_path.joinpath("vcf.zarr").as_posix()

with pytest.warns(MaxAltAllelesExceededWarning):
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
max_alt_alleles = 1
vcf_to_zarr(
path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles
)
ds = xr.open_zarr(output)

# extra alt alleles are dropped
Expand All @@ -122,6 +125,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
],
)

# genotype calls are truncated
assert np.all(ds["call_genotype"].values <= max_alt_alleles)

# the maximum number of alt alleles actually seen is stored as an attribute
assert ds.attrs["max_alt_alleles_seen"] == 3

Expand Down
18 changes: 13 additions & 5 deletions sgkit/tests/io/vcf/test_vcf_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,23 @@ def test_DP_field(shared_datadir, tmpdir):


@pytest.mark.parametrize(
"vcf_file,allel_exclude_fields,sgkit_exclude_fields",
"vcf_file,allel_exclude_fields,sgkit_exclude_fields,max_alt_alleles",
[
("sample.vcf.gz", None, None),
("mixed.vcf.gz", None, None),
("sample.vcf.gz", None, None, 3),
("mixed.vcf.gz", None, None, 3),
# exclude PL since it has Number=G, which is not yet supported
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"]),
# increase max_alt_alleles since scikit-allel does not truncate genotype calls
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"], 7),
],
)
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
def test_all_fields(
shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields
shared_datadir,
tmpdir,
vcf_file,
allel_exclude_fields,
sgkit_exclude_fields,
max_alt_alleles,
):
# change scikit-allel type defaults back to the VCF default
types = {
Expand All @@ -140,6 +146,7 @@ def test_all_fields(
fields=["*"],
exclude_fields=allel_exclude_fields,
types=types,
alt_number=max_alt_alleles,
)

field_defs = {
Expand All @@ -159,6 +166,7 @@ def test_all_fields(
exclude_fields=sgkit_exclude_fields,
field_defs=field_defs,
truncate_calls=True,
max_alt_alleles=max_alt_alleles,
)
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel
Expand Down

0 comments on commit 45aafb4

Please sign in to comment.