From 707336bf556ccbac96ce70dd6a5bb04529122391 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 2 Jan 2025 17:33:43 +0100 Subject: [PATCH] Make it possible to refer to the ID column from the FORMAT expression For example bcftools query test.vcf -f 'ID=%ID ID=[ %/ID] vs FMT_ID=[ %ID]' The same is now possible for CHROM, POS, REF, etc Resolves #2337 --- NEWS | 4 ++++ convert.c | 17 +++++++++++++++-- doc/bcftools.txt | 3 +++ test/query.3.1.out | 1 + test/query.3.2.out | 1 + test/query.3.3.out | 1 + test/query.3.vcf | 20 ++++++++++++++++++++ test/test.pl | 3 +++ 8 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 test/query.3.1.out create mode 100644 test/query.3.2.out create mode 100644 test/query.3.3.out create mode 100644 test/query.3.vcf diff --git a/NEWS b/NEWS index eee97954..d2bd2eac 100644 --- a/NEWS +++ b/NEWS @@ -43,6 +43,10 @@ Changes affecting specific commands: bcftools query test.vcf -f '%CHROM:%POS \t [ %AD] \t %SUM(FMT/AD)' bcftools query test.vcf -f '%CHROM:%POS \t [ %AD] \t %SUM(INFO/AD)' + - Make it possible to refer to the ID column from the FORMAT expression (#2337) + + bcftools query test.vcf -f 'ID=%ID ID=[ %/ID] vs FMT_ID=[ %ID]' + * bcftools roh - New visualization tool misc/roh-viz, see below diff --git a/convert.c b/convert.c index bdd772c7..ea51a4be 100644 --- a/convert.c +++ b/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -1466,12 +1466,25 @@ static int parse_subscript(char **p) static char *parse_tag(convert_t *convert, char *p, int is_gtf) { + int is_vcf_column = p[1]=='/' ? 1 : 0; + if ( is_vcf_column ) p++; + char *q = ++p; while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; kstring_t str = {0,0,0}; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - if ( is_gtf ) + if ( is_gtf && is_vcf_column ) + { + _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) + else if ( !strcmp(str.s, "ALT") ) + { + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); + fmt->subscript = parse_subscript(&q); + } + else error("Could not parse tag: %s .. %s\n", str.s,convert->format_str); + } + else if ( is_gtf ) { _SET_FILTER_EXPR(convert,set_filter_expr,p,q,1) else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 713a1902..932258ba 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -3253,6 +3253,9 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. bcftools query -f '[%SAMPLE %GT %DP\n]' -i 'FMT/DP=1 || FMT/DP=2' file.vcf bcftools query -f '[%SAMPLE %GT %DP\n]' -i 'FMT/DP=1 | FMT/DP=2' file.vcf + # Refer to ID column vs INFO/ID tag vs FORMAT/ID tag + bcftools query -f 'columnID=%ID infoID=%INFO/ID [fmtID=%ID ] [columnID=%/ID]' + [[reheader]] === bcftools reheader ['OPTIONS'] 'file.vcf.gz' diff --git a/test/query.3.1.out b/test/query.3.1.out new file mode 100644 index 00000000..5bf12ddc --- /dev/null +++ b/test/query.3.1.out @@ -0,0 +1 @@ +1 1000 ID G A 999 FILTER infoCHROM infoPOS infoID infoREF infoALT infoQUAL infoFILTER diff --git a/test/query.3.2.out b/test/query.3.2.out new file mode 100644 index 00000000..b6d749f5 --- /dev/null +++ b/test/query.3.2.out @@ -0,0 +1 @@ + fmtCHROM1 fmtCHROM2 fmtPOS1 fmtPOS2 fmtID1 fmtID2 fmtREF1 fmtREF2 fmtALT1 fmtALT2 fmtQUAL1 fmtQUAL2 fmtFILTER1 fmtFILTER2 diff --git a/test/query.3.3.out b/test/query.3.3.out new file mode 100644 index 00000000..185fe18d --- /dev/null +++ b/test/query.3.3.out @@ -0,0 +1 @@ + 1 1 1000 1000 ID ID G G A A 999 999 FILTER FILTER diff --git a/test/query.3.vcf b/test/query.3.vcf new file mode 100644 index 00000000..4de7cf76 --- /dev/null +++ b/test/query.3.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +1 1000 ID G A 999 FILTER CHROM=infoCHROM;POS=infoPOS;ID=infoID;REF=infoREF;ALT=infoALT;QUAL=infoQUAL;FILTER=infoFILTER CHROM:POS:ID:REF:ALT:QUAL:FILTER fmtCHROM1:fmtPOS1:fmtID1:fmtREF1:fmtALT1:fmtQUAL1:fmtFILTER1 fmtCHROM2:fmtPOS2:fmtID2:fmtREF2:fmtALT2:fmtQUAL2:fmtFILTER2 diff --git a/test/test.pl b/test/test.pl index 58e6a751..ba03f34c 100755 --- a/test/test.pl +++ b/test/test.pl @@ -115,6 +115,9 @@ run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf - --merge none'); run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.11.a','merge.gvcf.11.b','merge.gvcf.11.c'],out=>'merge.gvcf.11.1.out',args=>'--gvcf -'); # run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed +run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.1.out',args=>q[-f '%CHROM %POS %ID %REF %ALT %QUAL %FILTER \\t %INFO/CHROM %INFO/POS %INFO/ID %INFO/REF %INFO/ALT %INFO/QUAL %INFO/FILTER']); +run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.2.out',args=>q[-f '[ %CHROM] \\t [ %POS] \\t [ %ID] \\t [ %REF] \\t [ %ALT] \\t [ %QUAL] \\t [ %FILTER]']); +run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.3.out',args=>q[-f '[ %/CHROM] \\t [ %/POS] \\t [ %/ID] \\t [ %/REF] \\t [ %/ALT] \\t [ %/QUAL] \\t [ %/FILTER]']); run_test(\&test_vcf_query,$opts,in=>'query.filter.14',out=>'query.filter.14.1.out',args=>q[-f '%CHROM:%POS [ %SAMPLE %GT]']); run_test(\&test_vcf_query,$opts,in=>'query.filter.14',out=>'query.filter.14.2.out',args=>q[-f '%CHROM:%POS [ %SAMPLE %GT]' -i'GT="."']); run_test(\&test_vcf_query,$opts,in=>'query.filter.14',out=>'query.filter.14.3.out',args=>q[-f '%CHROM:%POS [ %SAMPLE %GT]' -i'GT="0|1"']);