Skip to content

Commit 283023b

Browse files
committed
Fix a bug for incorrectly formatted gVCF files
When the gVCF contains overlapping blocks, they would trigger an infinite loop in the program and it would never finish Resolves #2410
1 parent f310e99 commit 283023b

File tree

6 files changed

+49
-8
lines changed

6 files changed

+49
-8
lines changed

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ Changes affecting specific commands:
6767
- Do not merge symbolic alleles unless they have not just the same type, eg. <DEL>,
6868
but also length, i.e the INFO/END coordinate (#2362)
6969

70+
- Fix a bug where an incorrectly formatted gVCF file with overlapping blocks would trigger
71+
an infinite loop in the program (#2410)
72+
7073
* bcftools mpileup
7174

7275
- The -r/-R option newly merge overlapping regions, preventing the output of duplicate sites

test/merge.broken-gvcf.1.out

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
4+
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number">
5+
##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of the variant">
6+
##contig=<ID=chr1,length=248956422>
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SA SB
8+
chr1 80000 id1 G . 0 PASS END=90000 GT:CN 0/0:2 0/0:2
9+
chr1 80000 id2 G . 15.43 PASS . GT:CN 0/0:2 0/0:2

test/merge.broken-gvcf.a.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
3+
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number">
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of the variant">
5+
##contig=<ID=chr1,length=248956422>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SA
7+
chr1 80000 id1 G . 0 PASS END=90000 GT:CN 0/0:2
8+
chr1 80000 id2 G . 12.34 PASS END=95000 GT:CN 0/0:2

test/merge.broken-gvcf.b.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
3+
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number">
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of the variant">
5+
##contig=<ID=chr1,length=248956422>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SB
7+
chr1 80000 id1 G . 0 PASS END=90000 GT:CN 0/0:2
8+
chr1 80000 id2 G . 15.43 PASS END=95000 GT:CN 0/0:2

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.1.1','isec-miss.1.2','isec-miss.1.3'],out=>'isec-miss.1.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1');
6262
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-n +1 -r 20:100,20:140,12:55,20:140,20:100');
6363
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1');
64+
run_test(\&test_vcf_merge,$opts,in=>['merge.broken-gvcf.a','merge.broken-gvcf.b'],out=>'merge.broken-gvcf.1.out',args=>'');
6465
run_test(\&test_vcf_merge,$opts,in=>['merge.symbolic.1.a','merge.symbolic.1.b'],out=>'merge.symbolic.1.1.out',args=>'');
6566
run_test(\&test_vcf_merge,$opts,in=>['merge.multiallelics.1.a','merge.multiallelics.1.b'],out=>'merge.multiallelics.1.1.out',args=>'--merge none');
6667
run_test(\&test_vcf_merge,$opts,in=>['merge.multiallelics.1.a','merge.multiallelics.1.b'],out=>'merge.multiallelics.1.1.out',args=>'--merge both');

vcfmerge.c

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2723,20 +2723,31 @@ void gvcf_flush(args_t *args, int done)
27232723
}
27242724
}
27252725

2726-
static inline int is_gvcf_block(bcf1_t *line)
2726+
static inline int is_gvcf_block(args_t *args, bcf1_t *line)
27272727
{
27282728
if ( line->rlen<=1 ) return 0;
27292729
if ( strlen(line->d.allele[0])==line->rlen ) return 0;
2730-
if ( line->n_allele==1 ) return 1;
2730+
if ( line->n_allele==1 ) goto is_gvcf;
27312731

27322732
int i;
27332733
for (i=1; i<line->n_allele; i++)
27342734
{
2735-
if ( !strcmp(line->d.allele[i],"<*>") ) return 1;
2736-
if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return 1;
2737-
if ( !strcmp(line->d.allele[i],"<X>") ) return 1;
2735+
if ( !strcmp(line->d.allele[i],"<*>") ) goto is_gvcf;
2736+
if ( !strcmp(line->d.allele[i],"<NON_REF>") ) goto is_gvcf;
2737+
if ( !strcmp(line->d.allele[i],"<X>") ) goto is_gvcf;
27382738
}
27392739
return 0;
2740+
2741+
is_gvcf:
2742+
maux_t *ma = args->maux;
2743+
if ( !ma->gvcf )
2744+
{
2745+
args->do_gvcf = 1;
2746+
ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
2747+
for (i=0; i<ma->n; i++)
2748+
ma->gvcf[i].line = bcf_init1();
2749+
}
2750+
return 1;
27402751
}
27412752

27422753
/*
@@ -2776,7 +2787,7 @@ void gvcf_stage(args_t *args, int pos)
27762787
int irec = maux->buf[i].beg;
27772788
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
27782789
bcf1_t *line = args->files->readers[i].buffer[irec];
2779-
int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
2790+
int ret = is_gvcf_block(args,line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
27802791
if ( ret==1 )
27812792
{
27822793
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
@@ -3133,7 +3144,7 @@ int can_merge(args_t *args)
31333144
var_type &= ~VCF_INDEL;
31343145
}
31353146
var_type = var_type ? var_type<<1 : ref_mask;
3136-
if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
3147+
if ( args->do_gvcf && is_gvcf_block(args,line) ) var_type |= ref_mask;
31373148
buf->rec[j].var_types = var_type;
31383149
}
31393150
maux->var_types |= buf->rec[j].var_types;
@@ -3233,7 +3244,8 @@ void stage_line(args_t *args)
32333244
if ( buf->rec[j].skip )
32343245
{
32353246
int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
3236-
if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
3247+
if ( !is_gvcf && is_gvcf_block(args,buf->lines[j]) ) is_gvcf = 1;
3248+
if ( is_gvcf && buf->rec[j].skip && !maux->gvcf[i].active ) continue;
32373249
if ( !is_gvcf ) continue; // done or not compatible
32383250
}
32393251
if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line

0 commit comments

Comments
 (0)