diff --git a/README.rst b/README.rst
index 68f2fd55..9fa28cd3 100644
--- a/README.rst
+++ b/README.rst
@@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.21, samtools-1.21, and bcftools-1.21.
+The current version of pysam wraps 3rd-party code from htslib-1.22, samtools-1.22, and bcftools-1.22.
Pysam is available through `PyPI <https://pypi.python.org/pypi/pysam>`_.
To install, type::
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h
index 51c2d040..5a4071d9 100644
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -50,6 +50,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
// newline will be added by the function.
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
+// Set hts_verbose and return 0, or return -1 if str is not a valid integer
+int apply_verbosity(const char *str);
+
// For on the fly index creation with --write-index
int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname, int idx_fmt);
int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname);
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
index 54f17c22..c3344206 100644
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2014-2024 Genome Research Ltd.
+ Copyright (c) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -228,24 +228,24 @@ static void init_data(args_t *args)
if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
args->hdr = args->files->readers[0].header;
args->isample = -1;
- if ( !args->sample )
+ if ( args->sample_fname )
{
- args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
- if ( !args->smpl->n )
- {
- smpl_ilist_destroy(args->smpl);
- args->smpl = NULL;
- }
+ args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
else if ( args->sample && strcmp("-",args->sample) )
{
args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
- else if ( args->sample_fname )
+ else if ( !args->sample )
{
- args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
- if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
+ args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+ if ( !args->smpl->n )
+ {
+ smpl_ilist_destroy(args->smpl);
+ args->smpl = NULL;
+ }
}
if ( args->smpl )
{
@@ -768,12 +768,26 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
if ( ialt==-1 )
{
- char alleles[4];
- alleles[0] = rec->d.allele[0][0];
- alleles[1] = ',';
- alleles[2] = args->missing_allele;
- alleles[3] = 0;
- bcf_update_alleles_str(args->hdr, rec, alleles);
+ // missing allele, it can be a single position or an entire gvcf block
+ if ( rec->rlen>1 && bcf_has_variant_types(rec,VCF_REF,bcf_match_exact)>0 )
+ {
+ kstring_t str = {0,0,0};
+ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence
+ kputsn(args->fa_buf.s+idx,rec->rlen, &str);
+ kputc(',', &str);
+ for (i=0; i<rec->rlen; i++) kputc(args->missing_allele, &str);
+ bcf_update_alleles_str(args->hdr, rec, str.s);
+ free(str.s);
+ }
+ else
+ {
+ char alleles[4];
+ alleles[0] = rec->d.allele[0][0];
+ alleles[1] = ',';
+ alleles[2] = args->missing_allele;
+ alleles[3] = 0;
+ bcf_update_alleles_str(args->hdr, rec, alleles);
+ }
ialt = 1;
}
@@ -1203,6 +1217,7 @@ static void usage(args_t *args)
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(stderr, " # in the form \">chr:from-to\".\n");
@@ -1240,13 +1255,17 @@ int main_consensus(int argc, char *argv[])
{"chain",1,0,'c'},
{"prefix",required_argument,0,'p'},
{"regions-overlap",required_argument,0,5},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:v:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 1 : args->mark_del = optarg[0]; break;
case 2 :
if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
index a004f004..8b3eb4b1 100644
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -2,7 +2,7 @@
/* The MIT License
- Copyright (c) 2014-2024 Genome Research Ltd.
+ Copyright (c) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -230,24 +230,24 @@ static void init_data(args_t *args)
if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
args->hdr = args->files->readers[0].header;
args->isample = -1;
- if ( !args->sample )
+ if ( args->sample_fname )
{
- args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
- if ( !args->smpl->n )
- {
- smpl_ilist_destroy(args->smpl);
- args->smpl = NULL;
- }
+ args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
else if ( args->sample && strcmp("-",args->sample) )
{
args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
- else if ( args->sample_fname )
+ else if ( !args->sample )
{
- args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
- if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
+ args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+ if ( !args->smpl->n )
+ {
+ smpl_ilist_destroy(args->smpl);
+ args->smpl = NULL;
+ }
}
if ( args->smpl )
{
@@ -770,12 +770,26 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
if ( ialt==-1 )
{
- char alleles[4];
- alleles[0] = rec->d.allele[0][0];
- alleles[1] = ',';
- alleles[2] = args->missing_allele;
- alleles[3] = 0;
- bcf_update_alleles_str(args->hdr, rec, alleles);
+ // missing allele, it can be a single position or an entire gvcf block
+ if ( rec->rlen>1 && bcf_has_variant_types(rec,VCF_REF,bcf_match_exact)>0 )
+ {
+ kstring_t str = {0,0,0};
+ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence
+ kputsn(args->fa_buf.s+idx,rec->rlen, &str);
+ kputc(',', &str);
+ for (i=0; i<rec->rlen; i++) kputc(args->missing_allele, &str);
+ bcf_update_alleles_str(args->hdr, rec, str.s);
+ free(str.s);
+ }
+ else
+ {
+ char alleles[4];
+ alleles[0] = rec->d.allele[0][0];
+ alleles[1] = ',';
+ alleles[2] = args->missing_allele;
+ alleles[3] = 0;
+ bcf_update_alleles_str(args->hdr, rec, alleles);
+ }
ialt = 1;
}
@@ -1205,6 +1219,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n");
@@ -1242,13 +1257,17 @@ int main_consensus(int argc, char *argv[])
{"chain",1,0,'c'},
{"prefix",required_argument,0,'p'},
{"regions-overlap",required_argument,0,5},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:v:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 1 : args->mark_del = optarg[0]; break;
case 2 :
if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
diff --git a/bcftools/convert.c b/bcftools/convert.c
index c459c838..5ab39562 100644
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -1,6 +1,6 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -28,6 +28,7 @@ THE SOFTWARE. */
#include
#include
#include
+#include
#include
#include
#include
@@ -79,6 +80,7 @@ THE SOFTWARE. */
#define T_VKX 31 // VARIANTKEY HEX
#define T_PBINOM 32
#define T_NPASS 33
+#define T_FILTER_EXPR 34 // print the results of -i/-e functions via query
typedef struct _fmt_t
{
@@ -123,6 +125,16 @@ typedef struct
}
bcsq_t;
+typedef struct
+{
+ filter_t *filter;
+ int nval;
+ double *val;
+}
+filter_expr_t;
+
+static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type);
+
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
@@ -1157,6 +1169,50 @@ static void destroy_npass(void *usr)
{
filter_destroy((filter_t*)usr);
}
+static void process_filter_expr(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ filter_expr_t *dat = (filter_expr_t*) fmt->usr;
+
+ int i, nval, nval1;
+ const double *val;
+ if ( fmt->is_gt_field )
+ {
+ if ( !fmt->ready )
+ {
+ filter_test(dat->filter,line,NULL);
+ val = filter_get_doubles(dat->filter,&nval,&nval1);
+ if ( fmt->is_gt_field )
+ {
+ if ( !dat->nval )
+ {
+ dat->nval = nval;
+ dat->val = malloc(nval*sizeof(double));
+ if ( !dat->val ) error("Error: failed to allocate %zu bytes\n",nval*sizeof(double));
+ }
+ assert( dat->nval==nval );
+ for (i=0; i<nval; i++) dat->val[i] = val[i];
+ }
+ fmt->ready = 1;
+ }
+ val = dat->val;
+ nval = dat->nval;
+ }
+ else
+ {
+ filter_test(dat->filter,line,NULL);
+ val = filter_get_doubles(dat->filter,&nval,&nval1);
+ }
+ if ( isample<0 ) isample = 0;
+ if ( isample>=nval ) isample = 0;
+ kputd(val[isample], str);
+}
+static void destroy_filter_expr(void *usr)
+{
+ filter_expr_t *dat = (filter_expr_t*) usr;
+ filter_destroy(dat->filter);
+ free(dat->val);
+ free(dat);
+}
static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -1249,6 +1305,48 @@ static void _used_tags_add(convert_t *convert, int type, char *key)
else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \
else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); }
+// This invokes the functionality of -i/-e expressions
+static char *set_filter_expr(convert_t *convert, char *key, int is_gtf)
+{
+ kstring_t str = {0,0,0};
+ char *ptr = key;
+ while ( *ptr && *ptr!=')' ) ptr++;
+ if ( !*ptr ) error("Could not parse format string: %s\n",convert->format_str);
+ kputsn(key, ptr-key+1, &str);
+ register_tag(convert, str.s, is_gtf, T_FILTER_EXPR);
+ free(str.s);
+ return key+str.l;
+}
+
+// These are the -i/-e functions made to be printed via `query -f`
+#define _SET_FILTER_EXPR(convert,function,key,ptr,is_gtf) \
+ if ( !strncasecmp(key,"MAX(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MIN(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MEAN(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MEDIAN(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"AVG(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SUM(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"ABS(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"COUNT(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"STDEV(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"STRLEN(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"BINOM(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"PHRED(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MAX(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MIN(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MEAN(",10) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MEDIAN(",12) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_AVG(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_STDEV(",11) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_SUM(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMAX(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMIN(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMEAN(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMEDIAN(",8) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sAVG(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sSTDEV(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sSUM(",5) ) { ptr = function(convert,key,is_gtf); }
+
static void set_type(fmt_t *fmt, int type) { fmt->type = type; }
static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
{
@@ -1273,8 +1371,8 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
{
_SET_NON_FORMAT_TAGS(set_type,key,fmt)
- else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
- else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; }
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
@@ -1295,6 +1393,14 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
convert->max_unpack |= filter_max_unpack(flt);
fmt->usr = (void*) flt;
}
+ else if ( fmt->type==T_FILTER_EXPR )
+ {
+ filter_t *filter = filter_init(convert->header,key);
+ convert->max_unpack |= filter_max_unpack(filter);
+ filter_expr_t *dat = calloc(1,sizeof(filter_expr_t));
+ fmt->usr = dat;
+ dat->filter = filter;
+ }
}
switch (fmt->type)
@@ -1332,6 +1438,7 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
case T_VKX: fmt->handler = &process_variantkey_hex; break;
case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
+ case T_FILTER_EXPR: fmt->handler = &process_filter_expr; fmt->destroy = &destroy_filter_expr; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
if ( key && fmt->type==T_INFO )
@@ -1360,14 +1467,28 @@ static int parse_subscript(char **p)
static char *parse_tag(convert_t *convert, char *p, int is_gtf)
{
+ int is_vcf_column = p[1]=='/' ? 1 : 0;
+ if ( is_vcf_column ) p++;
+
char *q = ++p;
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
kstring_t str = {0,0,0};
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- if ( is_gtf )
+ if ( is_gtf && is_vcf_column )
+ {
+ _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
+ else if ( !strcmp(str.s, "ALT") )
+ {
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else error("Could not parse tag: %s .. %s\n", str.s,convert->format_str);
+ }
+ else if ( is_gtf )
{
- if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
+ _SET_FILTER_EXPR(convert,set_filter_expr,p,q,1)
+ else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
else if ( !strcmp(str.s, "TBCSQ") )
@@ -1422,6 +1543,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
else
{
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
+ else _SET_FILTER_EXPR(convert,set_filter_expr,p,q,0)
else if ( !strcmp(str.s, "ALT") )
{
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c
index e7d2905c..0b23f371 100644
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -2,7 +2,7 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -30,6 +30,7 @@ THE SOFTWARE. */
#include
#include
#include
+#include
#include
#include
#include
@@ -81,6 +82,7 @@ THE SOFTWARE. */
#define T_VKX 31 // VARIANTKEY HEX
#define T_PBINOM 32
#define T_NPASS 33
+#define T_FILTER_EXPR 34 // print the results of -i/-e functions via query
typedef struct _fmt_t
{
@@ -125,6 +127,16 @@ typedef struct
}
bcsq_t;
+typedef struct
+{
+ filter_t *filter;
+ int nval;
+ double *val;
+}
+filter_expr_t;
+
+static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type);
+
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
@@ -1159,6 +1171,50 @@ static void destroy_npass(void *usr)
{
filter_destroy((filter_t*)usr);
}
+static void process_filter_expr(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ filter_expr_t *dat = (filter_expr_t*) fmt->usr;
+
+ int i, nval, nval1;
+ const double *val;
+ if ( fmt->is_gt_field )
+ {
+ if ( !fmt->ready )
+ {
+ filter_test(dat->filter,line,NULL);
+ val = filter_get_doubles(dat->filter,&nval,&nval1);
+ if ( fmt->is_gt_field )
+ {
+ if ( !dat->nval )
+ {
+ dat->nval = nval;
+ dat->val = malloc(nval*sizeof(double));
+ if ( !dat->val ) error("Error: failed to allocate %zu bytes\n",nval*sizeof(double));
+ }
+ assert( dat->nval==nval );
+ for (i=0; i<nval; i++) dat->val[i] = val[i];
+ }
+ fmt->ready = 1;
+ }
+ val = dat->val;
+ nval = dat->nval;
+ }
+ else
+ {
+ filter_test(dat->filter,line,NULL);
+ val = filter_get_doubles(dat->filter,&nval,&nval1);
+ }
+ if ( isample<0 ) isample = 0;
+ if ( isample>=nval ) isample = 0;
+ kputd(val[isample], str);
+}
+static void destroy_filter_expr(void *usr)
+{
+ filter_expr_t *dat = (filter_expr_t*) usr;
+ filter_destroy(dat->filter);
+ free(dat->val);
+ free(dat);
+}
static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -1251,6 +1307,48 @@ static void _used_tags_add(convert_t *convert, int type, char *key)
else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \
else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); }
+// This invokes the functionality of -i/-e expressions
+static char *set_filter_expr(convert_t *convert, char *key, int is_gtf)
+{
+ kstring_t str = {0,0,0};
+ char *ptr = key;
+ while ( *ptr && *ptr!=')' ) ptr++;
+ if ( !*ptr ) error("Could not parse format string: %s\n",convert->format_str);
+ kputsn(key, ptr-key+1, &str);
+ register_tag(convert, str.s, is_gtf, T_FILTER_EXPR);
+ free(str.s);
+ return key+str.l;
+}
+
+// These are the -i/-e functions made to be printed via `query -f`
+#define _SET_FILTER_EXPR(convert,function,key,ptr,is_gtf) \
+ if ( !strncasecmp(key,"MAX(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MIN(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MEAN(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"MEDIAN(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"AVG(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SUM(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"ABS(",4) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"COUNT(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"STDEV(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"STRLEN(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"BINOM(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"PHRED(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MAX(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MIN(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MEAN(",10) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_MEDIAN(",12) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_AVG(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_STDEV(",11) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"SMPL_SUM(",9) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMAX(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMIN(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMEAN(",6) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sMEDIAN(",8) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sAVG(",5) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sSTDEV(",7) ) { ptr = function(convert,key,is_gtf); } \
+ else if ( !strncasecmp(key,"sSUM(",5) ) { ptr = function(convert,key,is_gtf); }
+
static void set_type(fmt_t *fmt, int type) { fmt->type = type; }
static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
{
@@ -1275,8 +1373,8 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
{
_SET_NON_FORMAT_TAGS(set_type,key,fmt)
- else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
- else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; }
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
@@ -1297,6 +1395,14 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
convert->max_unpack |= filter_max_unpack(flt);
fmt->usr = (void*) flt;
}
+ else if ( fmt->type==T_FILTER_EXPR )
+ {
+ filter_t *filter = filter_init(convert->header,key);
+ convert->max_unpack |= filter_max_unpack(filter);
+ filter_expr_t *dat = calloc(1,sizeof(filter_expr_t));
+ fmt->usr = dat;
+ dat->filter = filter;
+ }
}
switch (fmt->type)
@@ -1334,6 +1440,7 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
case T_VKX: fmt->handler = &process_variantkey_hex; break;
case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
+ case T_FILTER_EXPR: fmt->handler = &process_filter_expr; fmt->destroy = &destroy_filter_expr; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
if ( key && fmt->type==T_INFO )
@@ -1362,14 +1469,28 @@ static int parse_subscript(char **p)
static char *parse_tag(convert_t *convert, char *p, int is_gtf)
{
+ int is_vcf_column = p[1]=='/' ? 1 : 0;
+ if ( is_vcf_column ) p++;
+
char *q = ++p;
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
kstring_t str = {0,0,0};
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- if ( is_gtf )
+ if ( is_gtf && is_vcf_column )
+ {
+ _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
+ else if ( !strcmp(str.s, "ALT") )
+ {
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
+ fmt->subscript = parse_subscript(&q);
+ }
+ else error("Could not parse tag: %s .. %s\n", str.s,convert->format_str);
+ }
+ else if ( is_gtf )
{
- if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
+ _SET_FILTER_EXPR(convert,set_filter_expr,p,q,1)
+ else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
else if ( !strcmp(str.s, "TBCSQ") )
@@ -1424,6 +1545,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
else
{
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
+ else _SET_FILTER_EXPR(convert,set_filter_expr,p,q,0)
else if ( !strcmp(str.s, "ALT") )
{
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
diff --git a/bcftools/csq.c b/bcftools/csq.c
index b38eba10..53fa4daa 100644
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2016-2024 Genome Research Ltd.
+ Copyright (c) 2016-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -218,6 +218,10 @@
#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+#define CHR_VCF 0
+#define CHR_GFF 1
+#define CHR_FAI 2
+
// see kput_vcsq()
const char *csq_strings[] =
{
@@ -367,15 +371,24 @@ typedef struct
{
int mstack;
hstack_t *stack;
- gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
- kstring_t sseq; // spliced haplotype sequence on ref strand
- kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
- kstring_t tref; // the variable part of translated reference transcript, coding strand
- uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ kstring_t tseq_stop; // the stop/start codons in tseq and tref
+ kstring_t tref_stop; //
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
int upstream_stop;
}
hap_t;
+typedef struct
+{
+ int id;
+ const char *name, *code, *stop;
+}
+gencode_t;
+
typedef struct _args_t
{
// the main regidx lookups, from chr:beg-end to overlapping features and
@@ -413,14 +426,18 @@ typedef struct _args_t
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
- int unify_chr_names;
- char *chr_name;
+ char *unify_chr_names; // e.g. chr,Chromosome,-; prefixes in VCF,GFF,fasta
+ char *unify_chr_names_err;
+ char *chr_prefix[3]; // chr prefix to trim in VCF,GFF,fasta. See also CHR_VCF,CHR_GFF,CHR_FAI
+ char *chr_name, *chr_names[3];
int mchr_name;
struct {
int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
- int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ref_allele_mismatch;
} warned;
+ char *gencode_str; // which genetic code table to use
+ gencode_t *gencode; // genetic code table
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
hap_t *hap; // transcript haplotype recursion
@@ -440,8 +457,94 @@ typedef struct _args_t
}
args_t;
+// Generated with misc/gencode-tables
// AAA, AAC, ...
-const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+gencode_t gencode_tables[] =
+{
+ {.id=0, .name="Standard sipmlified",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-*-----*-------" },
+ {.id=1, .name="Standard",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-----M-" },
+ {.id=2, .name="Vertebrate Mitochondrial",
+ .code="KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------*-*-MMMM------------------------------M-*-*-------------" },
+ {.id=3, .name="Yeast Mitochondrial",
+ .code="KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------M-M-------------------------------M-*-*-------------" },
+ {.id=4, .name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------MMMM--------------M---------------M-*-*---------M-M-" },
+ {.id=5, .name="Invertebrate Mitochondrial",
+ .code="KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------MMMM------------------------------M-*-*-----------M-" },
+ {.id=6, .name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=9, .name="Echinoderm Mitochondrial; Flatworm Mitochondrial",
+ .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-------------" },
+ {.id=10, .name="Euplotid Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF",
+ .stop="--------------M---------------------------------*-*-------------" },
+ {.id=11, .name="Bacterial, Archaeal and Plant Plastid",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="------------MMMM--------------M---------------M-*-*-----*-----M-" },
+ {.id=12, .name="Alternative Yeast Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-------" },
+ {.id=13, .name="Ascidian Mitochondrial",
+ .code="KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------M-M-------------------------------M-*-*-----------M-" },
+ {.id=14, .name="Alternative Flatworm Mitochondrial",
+ .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF",
+ .stop="--------------M-----------------------------------*-------------" },
+ {.id=15, .name="Blepharisma Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-------*-------" },
+ {.id=16, .name="Chlorophycean Mitochondrial",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-------*-------" },
+ {.id=21, .name="Trematode Mitochondrial",
+ .code="NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-------------" },
+ {.id=22, .name="Scenedesmus obliquus Mitochondrial Code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF",
+ .stop="--------------M---------------------------------*---*---*-------" },
+ {.id=23, .name="Thraustochytrium mitochondrial code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF",
+ .stop="--------------MM------------------------------M-*-*-----*---*---" },
+ {.id=24, .name="Pterobranchia Mitochondrial",
+ .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M---------------M---------------M-*-*-----------M-" },
+ {.id=25, .name="Candidate Division SR1 and Gracilibacteria",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-----------M-" },
+ {.id=26, .name="Pachysolen tannophilus Nuclear Code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLALEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-------" },
+ {.id=27, .name="Karyorelict Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=28, .name="Condylostoma Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF",
+ .stop="--------------M---------------------------------*-*-----*-------" },
+ {.id=29, .name="Mesodinium Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYYYYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=30, .name="Peritrich Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=31, .name="Blastocrithidia Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSSWCWCLFLF",
+ .stop="--------------M---------------------------------*-*-------------" },
+ {.id=33, .name="Cephalodiscidae Mitochondrial UAA-Tyr",
+ .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF",
+ .stop="--------------M---------------M---------------M---*-----------M-" },
+ {.id=-1, .name=NULL, .code=NULL, .stop=NULL}
+};
+gencode_t *gencode = NULL;
const uint8_t nt4[] =
{
4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
@@ -464,8 +567,10 @@ const uint8_t cnt4[] =
4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
4,4,4,4, 0
};
-#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
-#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+#define dna2aa(x) gencode->code[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode->code[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+#define dna2stop(x) gencode->stop[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2stop(x) gencode->stop[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
static inline int ncsq2_to_nfmt(int ncsq2)
{
@@ -477,6 +582,111 @@ static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
*ibit = icsq2 % 30;
}
+static void init_gencode(args_t *args)
+{
+ int i,j,k;
+ if ( !args->gencode_str ) args->gencode_str = "0";
+ if ( !strcasecmp("l",args->gencode_str) )
+ {
+ printf("# The tables are ordered by codon as AAA,AAC,AAG,...,ACA,ACC,...,TTT:\n");
+ printf("#\n#\t");
+ for (i=0; i<4; i++)
+ {
+ for (j=0; j<16; j++) printf("%c","ACGT"[i]);
+ }
+ printf("\n#\t");
+ for (i=0; i<4; i++)
+ {
+ for (j=0; j<4; j++)
+ {
+ for (k=0; k<4; k++) printf("%c","ACGT"[j]);
+ }
+ }
+ printf("\n#\t");
+ for (i=0; i<16; i++)
+ {
+ for (k=0; k<4; k++) printf("%c","ACGT"[k]);
+ }
+ printf("\n#\n\n");
+
+ for (i=0; gencode_tables[i].id >= 0; i++)
+ {
+ gencode_t *gc = &gencode_tables[i];
+ printf("%d\t%s\n\t%s\n\t%s\n\n",gc->id,gc->name,gc->code,gc->stop);
+ }
+ exit(0);
+ }
+ char *tmp;
+ int id = strtol(args->gencode_str,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --genetic-code %s\n",args->gencode_str);
+ for (i=0; gencode_tables[i].id >= 0; i++)
+ {
+ gencode_t *gc = &gencode_tables[i];
+ if ( gc->id==id )
+ {
+ gencode = gc;
+ break;
+ }
+ }
+ if ( !gencode ) error("Could not parse argument: --genetic-code %s, no such table\n",args->gencode_str);
+}
+void test_prefix(kstring_t *str, const char *seq)
+{
+ if ( !strncasecmp(seq,"chromosome_",11) ) kputsn(seq,11,str);
+ else if ( !strncasecmp(seq,"chromosome",10) ) kputsn(seq,10,str);
+ else if ( !strncasecmp(seq,"chrom_",6) ) kputsn(seq,6,str);
+ else if ( !strncasecmp(seq,"chrom",5) ) kputsn(seq,5,str);
+ else if ( !strncasecmp(seq,"chr_",4) ) kputsn(seq,4,str);
+ else if ( !strncasecmp(seq,"chr",3) ) kputsn(seq,3,str);
+ else kputs("-",str);
+}
+void init_chr_names(args_t *args)
+{
+ // init chr prefixes to trim
+ int i,n;
+ char **tmp;
+
+ // chr prefixes given explicitly
+ args->unify_chr_names_err = strdup("check if --unify-chr-names or --force could help");
+ if ( args->unify_chr_names && (tmp=hts_readlist(args->unify_chr_names,0,&n)) )
+ {
+ if ( n!=3 ) error("Error: expected three strings, got --unify-chr-names %s\n",args->unify_chr_names);
+ for (i=0; i<3; i++)
+ if ( strcmp("-",tmp[i]) ) args->chr_prefix[i] = tmp[i];
+ else free(tmp[i]);
+ free(tmp);
+ return;
+ }
+
+ int nseq;
+ const char **vcf = bcf_hdr_seqnames(args->hdr, &nseq);
+ if ( !vcf ) return;
+ const char *seq_vcf = vcf[0];
+ const char *seq_gff = gff_iseq(args->gff,0);
+ const char *seq_fa = faidx_iseq(args->fai,0);
+ free(vcf);
+ if ( !strcmp(seq_vcf,seq_fa) && !strcmp(seq_vcf,seq_gff) ) return;
+
+ // First sequences not identical: either they have different prefix or they are in different order.
+ // See if we can suggest the --unify-chr-names parameter to use
+ kstring_t chr_vcf = {0,0,0}, chr_gff = {0,0,0}, chr_fa = {0,0,0}, str = {0,0,0};
+ test_prefix(&chr_vcf, seq_vcf);
+ test_prefix(&chr_gff, seq_gff);
+ test_prefix(&chr_fa, seq_fa);
+ int same_chr = 1;
+ if ( strcmp(!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l,!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l) ) same_chr = 0;
+ if ( strcmp(!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l,!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l) ) same_chr = 0;
+ if ( strcmp(!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l,!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l) ) same_chr = 0;
+ free(args->unify_chr_names_err);
+ if ( same_chr )
+ ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, try to run with --unify-chr-names %s,%s,%s\n",seq_vcf,seq_gff,seq_fa,chr_vcf.s,chr_gff.s,chr_fa.s);
+ else
+ ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, check if running with --unify-chr-names or --force coud help\n",seq_vcf,seq_gff,seq_fa);
+ free(chr_vcf.s);
+ free(chr_gff.s);
+ free(chr_fa.s);
+ args->unify_chr_names_err = str.s;
+}
void init_data(args_t *args)
{
args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
@@ -486,7 +696,6 @@ void init_data(args_t *args)
args->gff = gff_init(args->gff_fname);
gff_set(args->gff,verbosity,args->verbosity);
- gff_set(args->gff,strip_chr_names,args->unify_chr_names);
gff_set(args->gff,force_out_of_phase,args->force);
gff_set(args->gff,dump_fname,args->dump_gff);
gff_parse(args->gff);
@@ -496,6 +705,8 @@ void init_data(args_t *args)
args->idx_tscript = gff_get(args->gff,idx_tscript);
args->itr = regitr_init(NULL);
+ init_chr_names(args);
+
args->rid = -1;
if ( args->filter_str )
@@ -579,6 +790,13 @@ void destroy_data(args_t *args)
kh_destroy(pos2vbuf,args->pos2vbuf);
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
int i,j,ret;
+ for (i=0; i<3; i++)
+ {
+ free(args->chr_prefix[i]);
+ free(args->chr_names[i]);
+ }
+ free(args->chr_name);
+
if ( args->out_fh )
{
if ( args->write_index )
@@ -617,12 +835,14 @@ void destroy_data(args_t *args)
free(args->hap->sseq.s);
free(args->hap->tseq.s);
free(args->hap->tref.s);
+ free(args->hap->tseq_stop.s);
+ free(args->hap->tref_stop.s);
free(args->hap);
fai_destroy(args->fai);
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- free(args->chr_name);
+ free(args->unify_chr_names_err);
}
/*
@@ -666,6 +886,7 @@ void splice_init(splice_t *splice, bcf1_t *rec)
}
static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
{
+ // beg .. the beginning of the splice region
// len>0 .. beg is the first base, del filled from right
// len<0 .. beg is the last base, del filled from left
@@ -681,8 +902,24 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
}
else
{
- rbeg = abeg = beg;
- rlen = alen = len;
+ if ( beg < splice->tr->beg )
+ {
+ // This can happen with very short exons and introns. Not a real biology, but the program
+ // should not crash on it. This is not a real fix, the code would need a revamp to handle
+ // well cases like this, see test/csq/ENSCAFT00000047742
+ // >chr9:104-110
+ // ATGTCAGGGCC
+ // ATGTC-GGGCC
+ // 456
+ // eee.eee
+ rbeg = abeg = splice->tr->beg;
+ rlen = alen = 0;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ }
// check for incomplete del as above??
}
@@ -808,20 +1045,32 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type);
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
-static inline const char *drop_chr_prefix(args_t *args, const char *chr)
+static inline const char *unify_chr_name(args_t *args, const char *chr, int isrc, int idst)
{
- if ( !args->unify_chr_names ) return chr;
- if ( !strncasecmp("chr",chr,3) ) return chr+3;
- return chr;
-}
-static inline const char *add_chr_prefix(args_t *args, const char *chr)
-{
- if ( !args->unify_chr_names ) return chr;
- int len = strlen(chr);
- hts_expand(char,len+4,args->mchr_name,args->chr_name);
- memcpy(args->chr_name,"chr",3);
- memcpy(args->chr_name+3,chr,len+1);
- return args->chr_name;
+ if ( !args->chr_prefix[isrc] && !args->chr_prefix[idst] ) return chr;
+
+ int off = 0, len = strlen(chr);
+ if ( args->chr_prefix[isrc] )
+ {
+ off = strlen(args->chr_prefix[isrc]);
+ len -= off;
+ if ( strncmp(args->chr_prefix[isrc],chr,off) )
+ error("Error: failed to unify chr names, cannot strip \"%s\" from \"%s\"\n",args->chr_prefix[isrc],chr);
+ }
+ hts_expand(char,len+1,args->mchr_name,args->chr_name);
+ memcpy(args->chr_name,chr+off,len+1);
+
+ if ( args->chr_prefix[idst] )
+ {
+ off = strlen(args->chr_prefix[idst]);
+ hts_expand(char,len+off+1,args->mchr_name,args->chr_name);
+ memmove(args->chr_name+off,args->chr_name,len+1);
+ memcpy(args->chr_name,args->chr_prefix[idst],off);
+ }
+
+ free(args->chr_names[idst]);
+ args->chr_names[idst] = strdup(args->chr_name);
+ return args->chr_names[idst];
}
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
@@ -848,7 +1097,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -886,7 +1135,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1065,7 +1314,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1093,7 +1342,9 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%
{
if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
- if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+
+ int noff = N_SPLICE_REGION_INTRON - N_SPLICE_DONOR;
+ if ( ref && alt && noff<splice->kref.l && noff<splice->kalt.l && !strncmp(ref+noff,alt+noff,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
}
}
@@ -1121,7 +1372,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1212,7 +1463,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1242,7 +1493,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1348,10 +1599,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; }
}
- if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ if ( splice.check_start )
{
- if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
- else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ // Do not check starts in incomplete CDS, defined as not starting with M
+ // Note this is not always true, there are alternative start codons
+ if ( tr->strand==STRAND_FWD ) { if ( dna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else if ( tr->strand==STRAND_REV ) { if ( cdna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
}
if ( child->icds!=0 ) splice.check_region_beg = 1;
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -1365,7 +1618,7 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
#endif
if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
{
free(splice.kref.s);
free(splice.kalt.s);
@@ -1493,7 +1746,7 @@ void hap_destroy(hap_node_t *hap)
tseq: translated sequence (aa)
fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
*/
-void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, kstring_t *tseq_stop, int fill)
{
#if XDBG
fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
@@ -1505,9 +1758,11 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
kstring_t seq = *_seq;
tseq->l = 0;
+ tseq_stop->l = 0;
if ( !seq.l )
{
kputc('?', tseq);
+ kputc('?', tseq_stop);
return;
}
@@ -1541,6 +1796,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
if ( i==3 )
{
kputc_(dna2aa(tmp), tseq);
+ kputc_(dna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
@@ -1549,6 +1805,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
while ( codon < end )
{
kputc_(dna2aa(codon), tseq);
+ kputc_(dna2stop(codon), tseq_stop);
#if DBG>1
fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
#endif
@@ -1572,6 +1829,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
codon++;
}
kputc_(dna2aa(tmp), tseq);
+ kputc_(dna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
@@ -1582,6 +1840,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
while ( codon+3 <= end )
{
kputc_(dna2aa(codon), tseq);
+ kputc_(dna2stop(codon), tseq_stop);
#if DBG>1
fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
#endif
@@ -1626,10 +1885,12 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
#endif
kputc_(cdna2aa(tmp), tseq);
+ kputc_(cdna2stop(tmp), tseq_stop);
codon = end - 3;
while ( codon >= seq.s )
{
kputc_(cdna2aa(codon), tseq);
+ kputc_(cdna2stop(codon), tseq_stop);
#if DBG>1
fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
#endif
@@ -1659,6 +1920,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
{
for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
kputc_(cdna2aa(tmp), tseq);
+ kputc_(cdna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
#endif
@@ -1669,6 +1931,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
while ( codon >= ref.s + N_REF_PAD )
{
kputc_(cdna2aa(codon), tseq);
+ kputc_(cdna2stop(codon), tseq_stop);
#if DBG>1
fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
#endif
@@ -1678,6 +1941,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
}
else error("Should not happen: %d\n", strand);
kputc_(0,tseq); tseq->l--;
+ kputc_(0,tseq_stop); tseq_stop->l--;
#if DBG
fprintf(stderr," tseq: %s\n", tseq->s);
#endif
@@ -1771,6 +2035,9 @@ fprintf(stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type);
if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+ if ( vrec->vcsq[i].type&CSQ_START_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_START_LOST|CSQ_SYNONYMOUS_VARIANT);
+
if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
goto exit_duplicate;
}
@@ -1868,14 +2135,14 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
kputs(csq->vstr.s, str);
}
-void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
+void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *stop, kstring_t *str)
{
if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
kputs(aa->s, str);
else
{
int i, len = aa->l;
- if ( aa->s[len-1]=='*' ) len--;
+ if ( stop->s[len-1]=='*' ) len--;
for (i=0; i<args->brief_predictions; i++) kputc(aa->s[i], str);
kputs("..", str);
kputw(beg+len, str);
@@ -1909,33 +2176,37 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
if ( hap->stack[ibeg].node->type != HAP_SSS )
{
// check for truncating stops
- for (i=0; i<hap->tref.l; i++)
- if ( hap->tref.s[i]=='*' ) break;
- if ( i!=hap->tref.l )
+ for (i=0; i<hap->tref_stop.l; i++)
+ if ( hap->tref_stop.s[i]=='*' ) break;
+ if ( i!=hap->tref_stop.l )
{
hap->tref.l = i+1;
hap->tref.s[i+1] = 0;
+ hap->tref_stop.l = i+1;
+ hap->tref_stop.s[i+1] = 0;
}
- for (i=0; i<hap->tseq.l; i++)
- if ( hap->tseq.s[i]=='*' ) break;
+ for (i=0; i<hap->tseq_stop.l; i++)
+ if ( hap->tseq_stop.s[i]=='*' ) break;
if ( i!=hap->tseq.l )
{
hap->tseq.l = i+1;
hap->tseq.s[i+1] = 0;
+ hap->tseq_stop.l = i+1;
+ hap->tseq_stop.s[i+1] = 0;
hap->upstream_stop = 1;
}
if ( csq->type.type & CSQ_STOP_LOST )
{
- if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]=='*' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] )
{
rm_csq |= CSQ_STOP_LOST;
csq->type.type |= CSQ_STOP_RETAINED;
}
- else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ else if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' )
{
// This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
// We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
- if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ if ( hap->tseq_stop.s[hap->tseq_stop.l-1] == '*' )
{
rm_csq |= CSQ_STOP_GAINED;
csq->type.type |= CSQ_STOP_RETAINED;
@@ -1944,10 +2215,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
csq->type.type |= CSQ_INCOMPLETE_CDS;
}
}
- if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ if ( csq->type.type & CSQ_START_LOST )
{
- rm_csq |= CSQ_START_LOST;
- csq->type.type &= ~CSQ_START_LOST;
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]=='M' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type |= CSQ_START_RETAINED;
+ }
}
if ( dlen!=0 )
{
@@ -1957,7 +2231,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
csq->type.type |= CSQ_INFRAME_DELETION;
else
csq->type.type |= CSQ_INFRAME_INSERTION;
- if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' )
csq->type.type |= CSQ_STOP_GAINED;
}
else
@@ -1967,9 +2241,9 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
{
if ( hap->tref.s[i] == hap->tseq.s[i] ) continue;
aa_change = 1;
- if ( hap->tref.s[i] == '*' )
+ if ( hap->tref_stop.s[i] == '*' )
csq->type.type |= CSQ_STOP_LOST;
- else if ( hap->tseq.s[i] == '*' )
+ else if ( hap->tseq_stop.s[i] == '*' )
csq->type.type |= CSQ_STOP_GAINED;
else
csq->type.type |= CSQ_MISSENSE_VARIANT;
@@ -1979,11 +2253,19 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}
// Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
- if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
+ if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' )
{
rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
}
+ if ( csq->type.type & CSQ_FRAMESHIFT_VARIANT && csq->type.type & CSQ_START_LOST )
+ {
+ // this is to prevent
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G
+ rm_csq |= CSQ_FRAMESHIFT_VARIANT;
+ hap->stack[ibeg].node->type = HAP_SSS;
+ }
if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
csq->type.type &= ~rm_csq;
@@ -2004,12 +2286,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
- kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
+ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&hap->tref_stop,&str);
if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
- kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
+ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&hap->tseq_stop,&str);
}
kputc_('|', &str);
@@ -2080,6 +2362,7 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->sseq.l = 0;
hap->tseq.l = 0;
+ hap->tseq_stop.l = 0;
hap->stack[0].node = TSCRIPT_AUX(tr)->root;
hap->stack[0].ichild = -1;
hap->stack[0].slen = 0;
@@ -2167,13 +2450,13 @@ void hap_finalize(args_t *args, hap_t *hap)
}
else // splice site overlap, see #1475227917
sseq.l = fill = 0;
- cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill);
// ref
sseq.l = node2rend(i) - rbeg;
sseq.s = sref.s + N_REF_PAD + rbeg;
sseq.m = sref.m - 2*N_REF_PAD;
- cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
@@ -2225,13 +2508,13 @@ void hap_finalize(args_t *args, hap_t *hap)
}
else // splice site overlap, see #1475227917
sseq.l = fill = 0;
- cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill);
// ref
sseq.l = node2rend(ibeg) - rbeg;
sseq.s = sref.s + N_REF_PAD + rbeg;
sseq.m = sref.m - 2*N_REF_PAD;
- cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
@@ -2489,13 +2772,7 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
int i, len;
int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
- const char *tmp_chr = chr;
- if ( !faidx_has_seq(args->fai,tmp_chr) )
- {
- tmp_chr = drop_chr_prefix(args,chr);
- if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
- }
- TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
if ( !TSCRIPT_AUX(tr)->ref )
error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
@@ -2513,7 +2790,8 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
}
}
-static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
+// returns 0 on success, negative number on reference mismatch
+static int sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
int vbeg = 0;
int rbeg = rec->pos - tr->beg + N_REF_PAD;
@@ -2525,23 +2803,40 @@ static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
while ( ref[i] && vcf[i] )
{
if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
- error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
- bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+ {
+ if ( !args->force )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+
+ else if ( args->verbosity && (!args->warned.ref_allele_mismatch || args->verbosity > 1) )
+ {
+ fprintf(stderr,"Warning: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+ if ( args->verbosity < 2 )
+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
+ }
+ args->warned.ref_allele_mismatch++;
+ return -1;
+ }
i++;
}
+ return 0;
}
int test_cds_local(args_t *args, bcf1_t *rec)
{
int i,j, ret = 0;
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
// structures to fake the normal test_cds machinery
hap_node_t root, node;
root.type = HAP_ROOT;
kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+ kstring_t *tref_stop = &args->hap->tref_stop, *tseq_stop = &args->hap->tseq_stop;
while ( regitr_overlap(args->itr) )
{
@@ -2553,12 +2848,12 @@ int test_cds_local(args_t *args, bcf1_t *rec)
if ( !TSCRIPT_AUX(tr) )
{
tr->aux = calloc(sizeof(tscript_t),1);
- tscript_init_ref(args, tr, chr);
+ tscript_init_ref(args, tr, chr_fai);
tscript_splice_ref(tr);
khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
}
- sanity_check_ref(args, tr, rec);
+ if ( sanity_check_ref(args, tr, rec)<0 ) continue;
kstring_t sref;
sref.s = TSCRIPT_AUX(tr)->sref;
@@ -2594,40 +2889,44 @@ int test_cds_local(args_t *args, bcf1_t *rec)
sseq.s = node.seq;
int alen = sseq.l = strlen(sseq.s);
int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
- cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, tseq_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD;
sseq.s = sref.s + N_REF_PAD + node.sbeg;
sseq.l = node.rlen;
- cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, tref_stop, fill);
// check for truncating stops
- for (j=0; j<tref->l; j++)
- if ( tref->s[j]=='*' ) break;
- if ( j!=tref->l )
+ for (j=0; j<tref_stop->l; j++)
+ if ( tref_stop->s[j]=='*' ) break;
+ if ( j!=tref_stop->l )
{
tref->l = j+1;
tref->s[j+1] = 0;
+ tref_stop->l = j+1;
+ tref_stop->s[j+1] = 0;
}
- for (j=0; j<tseq->l; j++)
- if ( tseq->s[j]=='*' ) break;
+ for (j=0; j<tseq_stop->l; j++)
+ if ( tseq_stop->s[j]=='*' ) break;
if ( j!=tseq->l )
{
tseq->l = j+1;
tseq->s[j+1] = 0;
+ tseq_stop->l = j+1;
+ tseq_stop->s[j+1] = 0;
}
if ( csq_type & CSQ_STOP_LOST )
{
- if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ if ( tref_stop->s[tref_stop->l-1]=='*' && tref_stop->s[tref_stop->l-1] == tseq_stop->s[tseq_stop->l-1] )
{
csq_type &= ~CSQ_STOP_LOST;
csq_type |= CSQ_STOP_RETAINED;
}
- else if (tref->s[tref->l-1]!='*' )
+ else if (tref_stop->s[tref_stop->l-1]!='*' )
{
// This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
// We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
- if ( tseq->s[tseq->l-1] == '*' )
+ if ( tseq_stop->s[tseq_stop->l-1] == '*' )
{
csq_type &= ~CSQ_STOP_GAINED;
csq_type |= CSQ_STOP_RETAINED;
@@ -2636,7 +2935,7 @@ int test_cds_local(args_t *args, bcf1_t *rec)
csq_type |= CSQ_INCOMPLETE_CDS;
}
}
- if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ if ( csq_type & CSQ_START_LOST && tref_stop->s[0]!='M' )
csq_type &= ~CSQ_START_LOST;
if ( node.dlen!=0 )
{
@@ -2646,8 +2945,20 @@ int test_cds_local(args_t *args, bcf1_t *rec)
csq_type |= CSQ_INFRAME_DELETION;
else
csq_type |= CSQ_INFRAME_INSERTION;
- if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
+ if ( tref_stop->s[tref_stop->l-1]!='*' && tseq_stop->s[tseq_stop->l-1]=='*' )
csq_type |= CSQ_STOP_GAINED;
+ if ( csq_type & CSQ_START_LOST && csq_type & CSQ_FRAMESHIFT_VARIANT )
+ {
+ // this is to prevent
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G
+ csq_type &= ~CSQ_FRAMESHIFT_VARIANT;
+ node.type = HAP_SSS;
+ csq_stage(args, &csq, rec);
+ free(node.seq);
+ free(node.var);
+ continue;
+ }
}
else
{
@@ -2656,9 +2967,9 @@ int test_cds_local(args_t *args, bcf1_t *rec)
{
if ( tref->s[j] == tseq->s[j] ) continue;
aa_change = 1;
- if ( tref->s[j] == '*' )
+ if ( tref_stop->s[j] == '*' )
csq_type |= CSQ_STOP_LOST;
- else if ( tseq->s[j] == '*' )
+ else if ( tseq_stop->s[j] == '*' )
csq_type |= CSQ_STOP_GAINED;
else
csq_type |= CSQ_MISSENSE_VARIANT;
@@ -2674,12 +2985,12 @@ int test_cds_local(args_t *args, bcf1_t *rec)
int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
- kprint_aa_prediction(args,aa_rbeg,tref,&str);
+ kprint_aa_prediction(args,aa_rbeg,tref,tref_stop,&str);
if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
- kprint_aa_prediction(args,aa_sbeg,tseq,&str);
+ kprint_aa_prediction(args,aa_sbeg,tseq,tseq_stop,&str);
}
kputc_('|', &str);
kputw(rec->pos+1, &str);
@@ -2715,9 +3026,11 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
static int overlaps_warned = 0, multiploid_warned = 0;
int i, ret = 0, hap_ret;
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
@@ -2729,7 +3042,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
// initialize the transcript and its haplotype tree, fetch the reference sequence
tr->aux = calloc(sizeof(tscript_t),1);
- tscript_init_ref(args, tr, chr);
+ tscript_init_ref(args, tr, chr_fai);
TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
@@ -2741,7 +3054,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
khp_insert(trhp, args->active_tr, &tr);
}
- sanity_check_ref(args, tr, rec);
+ if ( sanity_check_ref(args, tr, rec)<0 ) continue;
if ( args->phase==PHASE_DROP_GT )
{
@@ -2758,13 +3071,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(stderr,
"Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
if ( !overlaps_warned )
- fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
overlaps_warned = 1;
}
if ( args->out )
- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
hap_destroy(child);
@@ -2805,13 +3118,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(stderr,
"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
if ( !multiploid_warned )
- fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
multiploid_warned = 1;
}
if ( args->out )
- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
continue;
}
 for (ismpl=0; ismpl<args->smpl->n; ismpl++)
@@ -2828,7 +3141,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
- error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
@@ -2871,14 +3184,14 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(stderr,
"Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
if ( !overlaps_warned )
- fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
overlaps_warned = 1;
}
if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n",
- chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
}
hap_destroy(child);
continue;
@@ -2990,9 +3303,10 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
}
int test_utr(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_utr,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3028,8 +3342,9 @@ int test_utr(args_t *args, bcf1_t *rec)
}
int test_splice(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
- if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !regidx_overlap(args->idx_exon,chr_gff,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3060,8 +3375,9 @@ int test_splice(args_t *args, bcf1_t *rec)
}
int test_tscript(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
- if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !regidx_overlap(args->idx_tscript,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3103,7 +3419,8 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
warned = 1;
}
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
// only insertions atm
int beg = rec->pos + 1;
@@ -3111,7 +3428,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
int csq_class = CSQ_ELONGATION;
int hit = 0;
- if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_cds,chr_gff,beg,end, args->itr) )
{
while ( regitr_overlap(args->itr) )
{
@@ -3129,7 +3446,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
hit = 1;
}
}
- if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_utr,chr_gff,beg,end, args->itr) )
{
while ( regitr_overlap(args->itr) )
{
@@ -3147,7 +3464,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
hit = 1;
}
}
- if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_exon,chr_gff,beg,end, args->itr) )
{
splice_t splice;
splice_init(&splice, rec);
@@ -3166,7 +3483,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
if ( splice.csq ) hit = 1;
}
}
- if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) )
+ if ( !hit && regidx_overlap(args->idx_tscript,chr_gff,beg,end, args->itr) )
{
splice_t splice;
splice_init(&splice, rec);
@@ -3227,6 +3544,7 @@ static void process(args_t *args, bcf1_t **rec_ptr)
bcf1_t *rec = *rec_ptr;
static int32_t prev_rid = -1, prev_pos = -1;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
if ( prev_rid!=rec->rid )
{
prev_rid = rec->rid;
@@ -3235,14 +3553,28 @@ static void process(args_t *args, bcf1_t **rec_ptr)
// Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the
// reference file
- if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
+ if ( !faidx_has_seq(args->fai,chr_fai) )
{
- if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
- error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ static int missing_chr_fai_warned = 0;
+ if ( !args->force )
+ error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_fai,args->fa_fname,args->unify_chr_names_err);
+ else if ( !missing_chr_fai_warned++ )
+ fprintf(stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_fai,args->fa_fname);
+ }
+
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !gff_has_seq(args->gff,chr_gff) )
+ {
+ static int missing_chr_gff_warned = 0;
+ if ( !args->force )
+ error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_gff,args->gff_fname,args->unify_chr_names_err);
+ else if ( !missing_chr_gff_warned++ )
+ fprintf(stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_gff,args->gff_fname);
}
}
if ( prev_pos > rec->pos )
- error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",chr_vcf,prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
int call_csq = 1;
if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele
@@ -3305,6 +3637,7 @@ static const char *usage(void)
"\n"
"CSQ options:\n"
" -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -C, --genetic-code INT|l Specify the genetic code table to use, 'l' to print a list [0]\n"
" -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
" -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
@@ -3317,7 +3650,8 @@ static const char *usage(void)
"GFF options:\n"
" --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
" --force Run even if some sanity checks fail\n"
- " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+ " --unify-chr-names 0|LIST Unify chromosome naming by stripping a prefix in VCF,GFF,fasta, respectively [0]\n"
+ " (e.g., \"chr,Chr,-\" trims \"chr\" in VCF and \"Chr\" in GFF, fasta is unchanged)\n"
"General options:\n"
" -e, --exclude EXPR Exclude sites for which the expression is true\n"
" -i, --include EXPR Select sites for which the expression is true\n"
@@ -3334,7 +3668,7 @@ static const char *usage(void)
" -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with worker threads [0]\n"
- " -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " -v, --verbosity INT Verbosity level 0-6 [1]\n"
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
"\n"
"Example:\n"
@@ -3356,11 +3690,11 @@ int main_csq(int argc, char *argv[])
args->verbosity = 1;
args->record_cmd_line = 1;
args->clevel = -1;
- args->unify_chr_names = 1;
static struct option loptions[] =
{
{"force",0,0,1},
+ {"genetic-code",required_argument,NULL,'C'},
{"threads",required_argument,NULL,2},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
@@ -3377,6 +3711,7 @@ int main_csq(int argc, char *argv[])
{"phase",1,0,'p'},
{"quiet",0,0,'q'},
{"verbose",1,0,'v'},
+ {"verbosity",1,0,'v'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"regions-overlap",required_argument,NULL,4},
@@ -3395,7 +3730,7 @@ int main_csq(int argc, char *argv[])
int regions_overlap = 1;
int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:C:ln:bB:v:W::",loptions,NULL)) >= 0)
{
switch (c)
{
@@ -3414,11 +3749,13 @@ int main_csq(int argc, char *argv[])
if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
break;
case 'l': args->local_csq = 1; break;
+ case 'C': args->gencode_str = optarg; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
case 'v':
args->verbosity = atoi(optarg);
- if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+ if ( args->verbosity<0 ) error("Error: expected integer with -v, --verbosity\n");
+ if ( args->verbosity > 3 ) hts_verbose = args->verbosity;
break;
case 'p':
switch (optarg[0])
@@ -3482,16 +3819,14 @@ int main_csq(int argc, char *argv[])
error("Unsupported index format '%s'\n", optarg);
break;
case 7 : args->dump_gff = optarg; break;
- case 8 :
- if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
- else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
- else error("Could not parse: --unify-chr-names %s\n",optarg);
- break;
+ case 8 : args->unify_chr_names = optarg; break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
}
}
+ init_gencode(args);
+
char *fname = NULL;
if ( optind==argc )
{
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c
index 3f482fdf..dfe1e890 100644
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -2,7 +2,7 @@
/* The MIT License
- Copyright (c) 2016-2024 Genome Research Ltd.
+ Copyright (c) 2016-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -220,6 +220,10 @@
#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+#define CHR_VCF 0
+#define CHR_GFF 1
+#define CHR_FAI 2
+
// see kput_vcsq()
const char *csq_strings[] =
{
@@ -369,15 +373,24 @@ typedef struct
{
int mstack;
hstack_t *stack;
- gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
- kstring_t sseq; // spliced haplotype sequence on ref strand
- kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
- kstring_t tref; // the variable part of translated reference transcript, coding strand
- uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ kstring_t tseq_stop; // the stop/start codons in tseq and tref
+ kstring_t tref_stop; //
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
int upstream_stop;
}
hap_t;
+typedef struct
+{
+ int id;
+ const char *name, *code, *stop;
+}
+gencode_t;
+
typedef struct _args_t
{
// the main regidx lookups, from chr:beg-end to overlapping features and
@@ -415,14 +428,18 @@ typedef struct _args_t
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
- int unify_chr_names;
- char *chr_name;
+ char *unify_chr_names; // e.g. chr,Chromosome,-; prefixes in VCF,GFF,fasta
+ char *unify_chr_names_err;
+ char *chr_prefix[3]; // chr prefix to trim in VCF,GFF,fasta. See also CHR_VCF,CHR_GFF,CHR_FAI
+ char *chr_name, *chr_names[3];
int mchr_name;
struct {
int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
- int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ref_allele_mismatch;
} warned;
+ char *gencode_str; // which genetic code table to use
+ gencode_t *gencode; // genetic code table
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
hap_t *hap; // transcript haplotype recursion
@@ -442,8 +459,94 @@ typedef struct _args_t
}
args_t;
+// Generated with misc/gencode-tables
// AAA, AAC, ...
-const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+gencode_t gencode_tables[] =
+{
+ {.id=0, .name="Standard sipmlified",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-*-----*-------" },
+ {.id=1, .name="Standard",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-----M-" },
+ {.id=2, .name="Vertebrate Mitochondrial",
+ .code="KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------*-*-MMMM------------------------------M-*-*-------------" },
+ {.id=3, .name="Yeast Mitochondrial",
+ .code="KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------M-M-------------------------------M-*-*-------------" },
+ {.id=4, .name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------MMMM--------------M---------------M-*-*---------M-M-" },
+ {.id=5, .name="Invertebrate Mitochondrial",
+ .code="KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------MMMM------------------------------M-*-*-----------M-" },
+ {.id=6, .name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=9, .name="Echinoderm Mitochondrial; Flatworm Mitochondrial",
+ .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-------------" },
+ {.id=10, .name="Euplotid Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF",
+ .stop="--------------M---------------------------------*-*-------------" },
+ {.id=11, .name="Bacterial, Archaeal and Plant Plastid",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="------------MMMM--------------M---------------M-*-*-----*-----M-" },
+ {.id=12, .name="Alternative Yeast Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-------" },
+ {.id=13, .name="Ascidian Mitochondrial",
+ .code="KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="------------M-M-------------------------------M-*-*-----------M-" },
+ {.id=14, .name="Alternative Flatworm Mitochondrial",
+ .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF",
+ .stop="--------------M-----------------------------------*-------------" },
+ {.id=15, .name="Blepharisma Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-------*-------" },
+ {.id=16, .name="Chlorophycean Mitochondrial",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF",
+ .stop="--------------M---------------------------------*-------*-------" },
+ {.id=21, .name="Trematode Mitochondrial",
+ .code="NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-------------" },
+ {.id=22, .name="Scenedesmus obliquus Mitochondrial Code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF",
+ .stop="--------------M---------------------------------*---*---*-------" },
+ {.id=23, .name="Thraustochytrium mitochondrial code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF",
+ .stop="--------------MM------------------------------M-*-*-----*---*---" },
+ {.id=24, .name="Pterobranchia Mitochondrial",
+ .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF",
+ .stop="--------------M---------------M---------------M-*-*-----------M-" },
+ {.id=25, .name="Candidate Division SR1 and Gracilibacteria",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF",
+ .stop="--------------M-------------------------------M-*-*-----------M-" },
+ {.id=26, .name="Pachysolen tannophilus Nuclear Code",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLALEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF",
+ .stop="--------------M---------------M-----------------*-*-----*-------" },
+ {.id=27, .name="Karyorelict Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=28, .name="Condylostoma Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF",
+ .stop="--------------M---------------------------------*-*-----*-------" },
+ {.id=29, .name="Mesodinium Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYYYYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=30, .name="Peritrich Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSS*CWCLFLF",
+ .stop="--------------M-----------------------------------------*-------" },
+ {.id=31, .name="Blastocrithidia Nuclear",
+ .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSSWCWCLFLF",
+ .stop="--------------M---------------------------------*-*-------------" },
+ {.id=33, .name="Cephalodiscidae Mitochondrial UAA-Tyr",
+ .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF",
+ .stop="--------------M---------------M---------------M---*-----------M-" },
+ {.id=-1, .name=NULL, .code=NULL, .stop=NULL}
+};
+gencode_t *gencode = NULL;
const uint8_t nt4[] =
{
4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
@@ -466,8 +569,10 @@ const uint8_t cnt4[] =
4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
4,4,4,4, 0
};
-#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
-#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+#define dna2aa(x) gencode->code[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode->code[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+#define dna2stop(x) gencode->stop[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2stop(x) gencode->stop[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
static inline int ncsq2_to_nfmt(int ncsq2)
{
@@ -479,6 +584,111 @@ static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
*ibit = icsq2 % 30;
}
+static void init_gencode(args_t *args)
+{
+ int i,j,k;
+ if ( !args->gencode_str ) args->gencode_str = "0";
+ if ( !strcasecmp("l",args->gencode_str) )
+ {
+ fprintf(bcftools_stdout, "# The tables are ordered by codon as AAA,AAC,AAG,...,ACA,ACC,...,TTT:\n");
+ fprintf(bcftools_stdout, "#\n#\t");
+ for (i=0; i<4; i++)
+ {
+ for (j=0; j<16; j++) fprintf(bcftools_stdout, "%c","ACGT"[i]);
+ }
+ fprintf(bcftools_stdout, "\n#\t");
+ for (i=0; i<4; i++)
+ {
+ for (j=0; j<4; j++)
+ {
+ for (k=0; k<4; k++) fprintf(bcftools_stdout, "%c","ACGT"[j]);
+ }
+ }
+ fprintf(bcftools_stdout, "\n#\t");
+ for (i=0; i<16; i++)
+ {
+ for (k=0; k<4; k++) fprintf(bcftools_stdout, "%c","ACGT"[k]);
+ }
+ fprintf(bcftools_stdout, "\n#\n\n");
+
+ for (i=0; gencode_tables[i].id >= 0; i++)
+ {
+ gencode_t *gc = &gencode_tables[i];
+ fprintf(bcftools_stdout, "%d\t%s\n\t%s\n\t%s\n\n",gc->id,gc->name,gc->code,gc->stop);
+ }
+ bcftools_exit(0);
+ }
+ char *tmp;
+ int id = strtol(args->gencode_str,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --genetic-code %s\n",args->gencode_str);
+ for (i=0; gencode_tables[i].id >= 0; i++)
+ {
+ gencode_t *gc = &gencode_tables[i];
+ if ( gc->id==id )
+ {
+ gencode = gc;
+ break;
+ }
+ }
+ if ( !gencode ) error("Could not parse argument: --genetic-code %s, no such table\n",args->gencode_str);
+}
+void test_prefix(kstring_t *str, const char *seq)
+{
+ if ( !strncasecmp(seq,"chromosome_",11) ) kputsn(seq,11,str);
+ else if ( !strncasecmp(seq,"chromosome",10) ) kputsn(seq,10,str);
+ else if ( !strncasecmp(seq,"chrom_",6) ) kputsn(seq,6,str);
+ else if ( !strncasecmp(seq,"chrom",5) ) kputsn(seq,5,str);
+ else if ( !strncasecmp(seq,"chr_",4) ) kputsn(seq,4,str);
+ else if ( !strncasecmp(seq,"chr",3) ) kputsn(seq,3,str);
+ else kputs("-",str);
+}
+void init_chr_names(args_t *args)
+{
+ // init chr prefixes to trim
+ int i,n;
+ char **tmp;
+
+ // chr prefixes given explicitly
+ args->unify_chr_names_err = strdup("check if --unify-chr-names or --force could help");
+ if ( args->unify_chr_names && (tmp=hts_readlist(args->unify_chr_names,0,&n)) )
+ {
+ if ( n!=3 ) error("Error: expected three strings, got --unify-chr-names %s\n",args->unify_chr_names);
+ for (i=0; i<3; i++)
+ if ( strcmp("-",tmp[i]) ) args->chr_prefix[i] = tmp[i];
+ else free(tmp[i]);
+ free(tmp);
+ return;
+ }
+
+ int nseq;
+ const char **vcf = bcf_hdr_seqnames(args->hdr, &nseq);
+ if ( !vcf ) return;
+ const char *seq_vcf = vcf[0];
+ const char *seq_gff = gff_iseq(args->gff,0);
+ const char *seq_fa = faidx_iseq(args->fai,0);
+ free(vcf);
+ if ( !strcmp(seq_vcf,seq_fa) && !strcmp(seq_vcf,seq_gff) ) return;
+
+ // First sequences not identical: either they have different prefix or they are in different order.
+ // See if we can suggest the --unify-chr-names parameter to use
+ kstring_t chr_vcf = {0,0,0}, chr_gff = {0,0,0}, chr_fa = {0,0,0}, str = {0,0,0};
+ test_prefix(&chr_vcf, seq_vcf);
+ test_prefix(&chr_gff, seq_gff);
+ test_prefix(&chr_fa, seq_fa);
+ int same_chr = 1;
+ if ( strcmp(!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l,!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l) ) same_chr = 0;
+ if ( strcmp(!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l,!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l) ) same_chr = 0;
+ if ( strcmp(!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l,!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l) ) same_chr = 0;
+ free(args->unify_chr_names_err);
+ if ( same_chr )
+ ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, try to run with --unify-chr-names %s,%s,%s\n",seq_vcf,seq_gff,seq_fa,chr_vcf.s,chr_gff.s,chr_fa.s);
+ else
+ ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, check if running with --unify-chr-names or --force coud help\n",seq_vcf,seq_gff,seq_fa);
+ free(chr_vcf.s);
+ free(chr_gff.s);
+ free(chr_fa.s);
+ args->unify_chr_names_err = str.s;
+}
void init_data(args_t *args)
{
args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
@@ -488,7 +698,6 @@ void init_data(args_t *args)
args->gff = gff_init(args->gff_fname);
gff_set(args->gff,verbosity,args->verbosity);
- gff_set(args->gff,strip_chr_names,args->unify_chr_names);
gff_set(args->gff,force_out_of_phase,args->force);
gff_set(args->gff,dump_fname,args->dump_gff);
gff_parse(args->gff);
@@ -498,6 +707,8 @@ void init_data(args_t *args)
args->idx_tscript = gff_get(args->gff,idx_tscript);
args->itr = regitr_init(NULL);
+ init_chr_names(args);
+
args->rid = -1;
if ( args->filter_str )
@@ -581,6 +792,13 @@ void destroy_data(args_t *args)
kh_destroy(pos2vbuf,args->pos2vbuf);
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
int i,j,ret;
+ for (i=0; i<3; i++)
+ {
+ free(args->chr_prefix[i]);
+ free(args->chr_names[i]);
+ }
+ free(args->chr_name);
+
if ( args->out_fh )
{
if ( args->write_index )
@@ -619,12 +837,14 @@ void destroy_data(args_t *args)
free(args->hap->sseq.s);
free(args->hap->tseq.s);
free(args->hap->tref.s);
+ free(args->hap->tseq_stop.s);
+ free(args->hap->tref_stop.s);
free(args->hap);
fai_destroy(args->fai);
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- free(args->chr_name);
+ free(args->unify_chr_names_err);
}
/*
@@ -668,6 +888,7 @@ void splice_init(splice_t *splice, bcf1_t *rec)
}
static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
{
+ // beg .. the beginning of the splice region
// len>0 .. beg is the first base, del filled from right
// len<0 .. beg is the last base, del filled from left
@@ -683,8 +904,24 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
}
else
{
- rbeg = abeg = beg;
- rlen = alen = len;
+ if ( beg < splice->tr->beg )
+ {
+ // This can happen with very short exons and introns. Not real biology, but the program
+ // should not crash on it. This is not a real fix, the code would need a revamp to handle
+ // well cases like this, see test/csq/ENSCAFT00000047742
+ // >chr9:104-110
+ // ATGTCAGGGCC
+ // ATGTC-GGGCC
+ // 456
+ // eee.eee
+ rbeg = abeg = splice->tr->beg;
+ rlen = alen = 0;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ }
// check for incomplete del as above??
}
@@ -810,20 +1047,32 @@ fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type);
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
-static inline const char *drop_chr_prefix(args_t *args, const char *chr)
+static inline const char *unify_chr_name(args_t *args, const char *chr, int isrc, int idst)
{
- if ( !args->unify_chr_names ) return chr;
- if ( !strncasecmp("chr",chr,3) ) return chr+3;
- return chr;
-}
-static inline const char *add_chr_prefix(args_t *args, const char *chr)
-{
- if ( !args->unify_chr_names ) return chr;
- int len = strlen(chr);
- hts_expand(char,len+4,args->mchr_name,args->chr_name);
- memcpy(args->chr_name,"chr",3);
- memcpy(args->chr_name+3,chr,len+1);
- return args->chr_name;
+ if ( !args->chr_prefix[isrc] && !args->chr_prefix[idst] ) return chr;
+
+ int off = 0, len = strlen(chr);
+ if ( args->chr_prefix[isrc] )
+ {
+ off = strlen(args->chr_prefix[isrc]);
+ len -= off;
+ if ( strncmp(args->chr_prefix[isrc],chr,off) )
+ error("Error: failed to unify chr names, cannot strip \"%s\" from \"%s\"\n",args->chr_prefix[isrc],chr);
+ }
+ hts_expand(char,len+1,args->mchr_name,args->chr_name);
+ memcpy(args->chr_name,chr+off,len+1);
+
+ if ( args->chr_prefix[idst] )
+ {
+ off = strlen(args->chr_prefix[idst]);
+ hts_expand(char,len+off+1,args->mchr_name,args->chr_name);
+ memmove(args->chr_name+off,args->chr_name,len+1);
+ memcpy(args->chr_name,args->chr_prefix[idst],off);
+ }
+
+ free(args->chr_names[idst]);
+ args->chr_names[idst] = strdup(args->chr_name);
+ return args->chr_names[idst];
}
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
@@ -850,7 +1099,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -888,7 +1137,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1067,7 +1316,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1095,7 +1344,9 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,
{
if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
- if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+
+ int noff = N_SPLICE_REGION_INTRON - N_SPLICE_DONOR;
+ if ( ref && alt && noff<splice->kref.l && noff<splice->kalt.l && !strncmp(ref+noff,alt+noff,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
}
}
@@ -1123,7 +1374,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1214,7 +1465,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1244,7 +1495,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
+ const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -1350,10 +1601,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; }
}
- if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ if ( splice.check_start )
{
- if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
- else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ // Do not check starts in incomplete CDS, defined as not starting with M
+ // Note this is not always true, there are alternative start codons
+ if ( tr->strand==STRAND_FWD ) { if ( dna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else if ( tr->strand==STRAND_REV ) { if ( cdna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
}
if ( child->icds!=0 ) splice.check_region_beg = 1;
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -1367,7 +1620,7 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c
#endif
if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
{
free(splice.kref.s);
free(splice.kalt.s);
@@ -1495,7 +1748,7 @@ void hap_destroy(hap_node_t *hap)
tseq: translated sequence (aa)
fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
*/
-void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, kstring_t *tseq_stop, int fill)
{
#if XDBG
fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
@@ -1507,9 +1760,11 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
kstring_t seq = *_seq;
tseq->l = 0;
+ tseq_stop->l = 0;
if ( !seq.l )
{
kputc('?', tseq);
+ kputc('?', tseq_stop);
return;
}
@@ -1543,6 +1798,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
if ( i==3 )
{
kputc_(dna2aa(tmp), tseq);
+ kputc_(dna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
@@ -1551,6 +1807,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
while ( codon < end )
{
kputc_(dna2aa(codon), tseq);
+ kputc_(dna2stop(codon), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
#endif
@@ -1574,6 +1831,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
codon++;
}
kputc_(dna2aa(tmp), tseq);
+ kputc_(dna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
@@ -1584,6 +1842,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
while ( codon+3 <= end )
{
kputc_(dna2aa(codon), tseq);
+ kputc_(dna2stop(codon), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
#endif
@@ -1628,10 +1887,12 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
fprintf(bcftools_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
#endif
kputc_(cdna2aa(tmp), tseq);
+ kputc_(cdna2stop(tmp), tseq_stop);
codon = end - 3;
while ( codon >= seq.s )
{
kputc_(cdna2aa(codon), tseq);
+ kputc_(cdna2stop(codon), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
#endif
@@ -1661,6 +1922,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
{
for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
kputc_(cdna2aa(tmp), tseq);
+ kputc_(cdna2stop(tmp), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
#endif
@@ -1671,6 +1933,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
while ( codon >= ref.s + N_REF_PAD )
{
kputc_(cdna2aa(codon), tseq);
+ kputc_(cdna2stop(codon), tseq_stop);
#if DBG>1
fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
#endif
@@ -1680,6 +1943,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r
}
else error("Should not happen: %d\n", strand);
kputc_(0,tseq); tseq->l--;
+ kputc_(0,tseq_stop); tseq_stop->l--;
#if DBG
fprintf(bcftools_stderr," tseq: %s\n", tseq->s);
#endif
@@ -1773,6 +2037,9 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type);
if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+ if ( vrec->vcsq[i].type&CSQ_START_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_START_LOST|CSQ_SYNONYMOUS_VARIANT);
+
if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
goto exit_duplicate;
}
@@ -1870,14 +2137,14 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
kputs(csq->vstr.s, str);
}
-void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
+void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *stop, kstring_t *str)
{
if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
kputs(aa->s, str);
else
{
int i, len = aa->l;
- if ( aa->s[len-1]=='*' ) len--;
+ if ( stop->s[len-1]=='*' ) len--;
for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
kputs("..", str);
kputw(beg+len, str);
@@ -1911,33 +2178,37 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
if ( hap->stack[ibeg].node->type != HAP_SSS )
{
// check for truncating stops
- for (i=0; i<hap->tref.l; i++)
- if ( hap->tref.s[i]=='*' ) break;
- if ( i!=hap->tref.l )
+ for (i=0; i<hap->tref_stop.l; i++)
+ if ( hap->tref_stop.s[i]=='*' ) break;
+ if ( i!=hap->tref_stop.l )
{
hap->tref.l = i+1;
hap->tref.s[i+1] = 0;
+ hap->tref_stop.l = i+1;
+ hap->tref_stop.s[i+1] = 0;
}
- for (i=0; i<hap->tseq.l; i++)
- if ( hap->tseq.s[i]=='*' ) break;
+ for (i=0; i<hap->tseq_stop.l; i++)
+ if ( hap->tseq_stop.s[i]=='*' ) break;
if ( i!=hap->tseq.l )
{
hap->tseq.l = i+1;
hap->tseq.s[i+1] = 0;
+ hap->tseq_stop.l = i+1;
+ hap->tseq_stop.s[i+1] = 0;
hap->upstream_stop = 1;
}
if ( csq->type.type & CSQ_STOP_LOST )
{
- if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]=='*' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] )
{
rm_csq |= CSQ_STOP_LOST;
csq->type.type |= CSQ_STOP_RETAINED;
}
- else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ else if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' )
{
// This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
// We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
- if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ if ( hap->tseq_stop.s[hap->tseq_stop.l-1] == '*' )
{
rm_csq |= CSQ_STOP_GAINED;
csq->type.type |= CSQ_STOP_RETAINED;
@@ -1946,10 +2217,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
csq->type.type |= CSQ_INCOMPLETE_CDS;
}
}
- if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ if ( csq->type.type & CSQ_START_LOST )
{
- rm_csq |= CSQ_START_LOST;
- csq->type.type &= ~CSQ_START_LOST;
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]=='M' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type |= CSQ_START_RETAINED;
+ }
}
if ( dlen!=0 )
{
@@ -1959,7 +2233,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
csq->type.type |= CSQ_INFRAME_DELETION;
else
csq->type.type |= CSQ_INFRAME_INSERTION;
- if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
+ if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' )
csq->type.type |= CSQ_STOP_GAINED;
}
else
@@ -1969,9 +2243,9 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
{
if ( hap->tref.s[i] == hap->tseq.s[i] ) continue;
aa_change = 1;
- if ( hap->tref.s[i] == '*' )
+ if ( hap->tref_stop.s[i] == '*' )
csq->type.type |= CSQ_STOP_LOST;
- else if ( hap->tseq.s[i] == '*' )
+ else if ( hap->tseq_stop.s[i] == '*' )
csq->type.type |= CSQ_STOP_GAINED;
else
csq->type.type |= CSQ_MISSENSE_VARIANT;
@@ -1981,11 +2255,19 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}
// Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
- if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
+ if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' )
{
rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
}
+ if ( csq->type.type & CSQ_FRAMESHIFT_VARIANT && csq->type.type & CSQ_START_LOST )
+ {
+ // this is to prevent
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G
+ rm_csq |= CSQ_FRAMESHIFT_VARIANT;
+ hap->stack[ibeg].node->type = HAP_SSS;
+ }
if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
csq->type.type &= ~rm_csq;
@@ -2006,12 +2288,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
- kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
+ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&hap->tref_stop,&str);
if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
- kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
+ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&hap->tseq_stop,&str);
}
kputc_('|', &str);
@@ -2082,6 +2364,7 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->sseq.l = 0;
hap->tseq.l = 0;
+ hap->tseq_stop.l = 0;
hap->stack[0].node = TSCRIPT_AUX(tr)->root;
hap->stack[0].ichild = -1;
hap->stack[0].slen = 0;
@@ -2169,13 +2452,13 @@ void hap_finalize(args_t *args, hap_t *hap)
}
else // splice site overlap, see #1475227917
sseq.l = fill = 0;
- cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill);
// ref
sseq.l = node2rend(i) - rbeg;
sseq.s = sref.s + N_REF_PAD + rbeg;
sseq.m = sref.m - 2*N_REF_PAD;
- cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
@@ -2227,13 +2510,13 @@ void hap_finalize(args_t *args, hap_t *hap)
}
else // splice site overlap, see #1475227917
sseq.l = fill = 0;
- cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill);
// ref
sseq.l = node2rend(ibeg) - rbeg;
sseq.s = sref.s + N_REF_PAD + rbeg;
sseq.m = sref.m - 2*N_REF_PAD;
- cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
@@ -2491,13 +2774,7 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
int i, len;
int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
- const char *tmp_chr = chr;
- if ( !faidx_has_seq(args->fai,tmp_chr) )
- {
- tmp_chr = drop_chr_prefix(args,chr);
- if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
- }
- TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
if ( !TSCRIPT_AUX(tr)->ref )
error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
@@ -2515,7 +2792,8 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
}
}
-static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
+// returns 0 on success, negative number on reference mismatch
+static int sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
int vbeg = 0;
int rbeg = rec->pos - tr->beg + N_REF_PAD;
@@ -2527,23 +2805,40 @@ static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
while ( ref[i] && vcf[i] )
{
if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
- error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
- bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+ {
+ if ( !args->force )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+
+ else if ( args->verbosity && (!args->warned.ref_allele_mismatch || args->verbosity > 1) )
+ {
+ fprintf(bcftools_stderr,"Warning: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
+ if ( args->verbosity < 2 )
+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
+ }
+ args->warned.ref_allele_mismatch++;
+ return -1;
+ }
i++;
}
+ return 0;
}
int test_cds_local(args_t *args, bcf1_t *rec)
{
int i,j, ret = 0;
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
// structures to fake the normal test_cds machinery
hap_node_t root, node;
root.type = HAP_ROOT;
kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+ kstring_t *tref_stop = &args->hap->tref_stop, *tseq_stop = &args->hap->tseq_stop;
while ( regitr_overlap(args->itr) )
{
@@ -2555,12 +2850,12 @@ int test_cds_local(args_t *args, bcf1_t *rec)
if ( !TSCRIPT_AUX(tr) )
{
tr->aux = calloc(sizeof(tscript_t),1);
- tscript_init_ref(args, tr, chr);
+ tscript_init_ref(args, tr, chr_fai);
tscript_splice_ref(tr);
khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
}
- sanity_check_ref(args, tr, rec);
+ if ( sanity_check_ref(args, tr, rec)<0 ) continue;
kstring_t sref;
sref.s = TSCRIPT_AUX(tr)->sref;
@@ -2596,40 +2891,44 @@ int test_cds_local(args_t *args, bcf1_t *rec)
sseq.s = node.seq;
int alen = sseq.l = strlen(sseq.s);
int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
- cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, tseq_stop, fill);
sseq.m = sref.m - 2*N_REF_PAD;
sseq.s = sref.s + N_REF_PAD + node.sbeg;
sseq.l = node.rlen;
- cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, tref_stop, fill);
// check for truncating stops
- for (j=0; j<tref->l; j++)
- if ( tref->s[j]=='*' ) break;
- if ( j!=tref->l )
+ for (j=0; j<tref_stop->l; j++)
+ if ( tref_stop->s[j]=='*' ) break;
+ if ( j!=tref_stop->l )
{
tref->l = j+1;
tref->s[j+1] = 0;
+ tref_stop->l = j+1;
+ tref_stop->s[j+1] = 0;
}
- for (j=0; j<tseq->l; j++)
- if ( tseq->s[j]=='*' ) break;
+ for (j=0; j<tseq_stop->l; j++)
+ if ( tseq_stop->s[j]=='*' ) break;
if ( j!=tseq->l )
{
tseq->l = j+1;
tseq->s[j+1] = 0;
+ tseq_stop->l = j+1;
+ tseq_stop->s[j+1] = 0;
}
if ( csq_type & CSQ_STOP_LOST )
{
- if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ if ( tref_stop->s[tref_stop->l-1]=='*' && tref_stop->s[tref_stop->l-1] == tseq_stop->s[tseq_stop->l-1] )
{
csq_type &= ~CSQ_STOP_LOST;
csq_type |= CSQ_STOP_RETAINED;
}
- else if (tref->s[tref->l-1]!='*' )
+ else if (tref_stop->s[tref_stop->l-1]!='*' )
{
// This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
// We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
- if ( tseq->s[tseq->l-1] == '*' )
+ if ( tseq_stop->s[tseq_stop->l-1] == '*' )
{
csq_type &= ~CSQ_STOP_GAINED;
csq_type |= CSQ_STOP_RETAINED;
@@ -2638,7 +2937,7 @@ int test_cds_local(args_t *args, bcf1_t *rec)
csq_type |= CSQ_INCOMPLETE_CDS;
}
}
- if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ if ( csq_type & CSQ_START_LOST && tref_stop->s[0]!='M' )
csq_type &= ~CSQ_START_LOST;
if ( node.dlen!=0 )
{
@@ -2648,8 +2947,20 @@ int test_cds_local(args_t *args, bcf1_t *rec)
csq_type |= CSQ_INFRAME_DELETION;
else
csq_type |= CSQ_INFRAME_INSERTION;
- if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
+ if ( tref_stop->s[tref_stop->l-1]!='*' && tseq_stop->s[tseq_stop->l-1]=='*' )
csq_type |= CSQ_STOP_GAINED;
+ if ( csq_type & CSQ_START_LOST && csq_type & CSQ_FRAMESHIFT_VARIANT )
+ {
+ // this is to prevent
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+
+ // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G
+ csq_type &= ~CSQ_FRAMESHIFT_VARIANT;
+ node.type = HAP_SSS;
+ csq_stage(args, &csq, rec);
+ free(node.seq);
+ free(node.var);
+ continue;
+ }
}
else
{
@@ -2658,9 +2969,9 @@ int test_cds_local(args_t *args, bcf1_t *rec)
{
if ( tref->s[j] == tseq->s[j] ) continue;
aa_change = 1;
- if ( tref->s[j] == '*' )
+ if ( tref_stop->s[j] == '*' )
csq_type |= CSQ_STOP_LOST;
- else if ( tseq->s[j] == '*' )
+ else if ( tseq_stop->s[j] == '*' )
csq_type |= CSQ_STOP_GAINED;
else
csq_type |= CSQ_MISSENSE_VARIANT;
@@ -2676,12 +2987,12 @@ int test_cds_local(args_t *args, bcf1_t *rec)
int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
- kprint_aa_prediction(args,aa_rbeg,tref,&str);
+ kprint_aa_prediction(args,aa_rbeg,tref,tref_stop,&str);
if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
- kprint_aa_prediction(args,aa_sbeg,tseq,&str);
+ kprint_aa_prediction(args,aa_sbeg,tseq,tseq_stop,&str);
}
kputc_('|', &str);
kputw(rec->pos+1, &str);
@@ -2717,9 +3028,11 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
static int overlaps_warned = 0, multiploid_warned = 0;
int i, ret = 0, hap_ret;
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
@@ -2731,7 +3044,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
// initialize the transcript and its haplotype tree, fetch the reference sequence
tr->aux = calloc(sizeof(tscript_t),1);
- tscript_init_ref(args, tr, chr);
+ tscript_init_ref(args, tr, chr_fai);
TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
@@ -2743,7 +3056,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
khp_insert(trhp, args->active_tr, &tr);
}
- sanity_check_ref(args, tr, rec);
+ if ( sanity_check_ref(args, tr, rec)<0 ) continue;
if ( args->phase==PHASE_DROP_GT )
{
@@ -2760,13 +3073,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(bcftools_stderr,
"Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
if ( !overlaps_warned )
- fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
overlaps_warned = 1;
}
if ( args->out )
- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
hap_destroy(child);
@@ -2807,13 +3120,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(bcftools_stderr,
"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
if ( !multiploid_warned )
- fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
multiploid_warned = 1;
}
if ( args->out )
- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
continue;
}
for (ismpl=0; ismpl<args->smpl->n; ismpl++)
@@ -2830,7 +3143,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
- error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
@@ -2873,14 +3186,14 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
fprintf(bcftools_stderr,
"Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n",
- chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
if ( !overlaps_warned )
- fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n");
overlaps_warned = 1;
}
if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n",
- chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
}
hap_destroy(child);
continue;
@@ -2992,9 +3305,10 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
}
int test_utr(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
// note that the off-by-one extension of rlen is deliberate to account for insertions
- if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ if ( !regidx_overlap(args->idx_utr,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3030,8 +3344,9 @@ int test_utr(args_t *args, bcf1_t *rec)
}
int test_splice(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
- if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !regidx_overlap(args->idx_exon,chr_gff,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3062,8 +3377,9 @@ int test_splice(args_t *args, bcf1_t *rec)
}
int test_tscript(args_t *args, bcf1_t *rec)
{
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
- if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !regidx_overlap(args->idx_tscript,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
splice_init(&splice, rec);
@@ -3105,7 +3421,8 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
warned = 1;
}
- const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
// only insertions atm
int beg = rec->pos + 1;
@@ -3113,7 +3430,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
int csq_class = CSQ_ELONGATION;
int hit = 0;
- if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_cds,chr_gff,beg,end, args->itr) )
{
while ( regitr_overlap(args->itr) )
{
@@ -3131,7 +3448,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
hit = 1;
}
}
- if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_utr,chr_gff,beg,end, args->itr) )
{
while ( regitr_overlap(args->itr) )
{
@@ -3149,7 +3466,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
hit = 1;
}
}
- if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) )
+ if ( regidx_overlap(args->idx_exon,chr_gff,beg,end, args->itr) )
{
splice_t splice;
splice_init(&splice, rec);
@@ -3168,7 +3485,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
if ( splice.csq ) hit = 1;
}
}
- if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) )
+ if ( !hit && regidx_overlap(args->idx_tscript,chr_gff,beg,end, args->itr) )
{
splice_t splice;
splice_init(&splice, rec);
@@ -3229,6 +3546,7 @@ static void process(args_t *args, bcf1_t **rec_ptr)
bcf1_t *rec = *rec_ptr;
static int32_t prev_rid = -1, prev_pos = -1;
+ const char *chr_vcf = bcf_seqname(args->hdr,rec);
if ( prev_rid!=rec->rid )
{
prev_rid = rec->rid;
@@ -3237,14 +3555,28 @@ static void process(args_t *args, bcf1_t **rec_ptr)
// Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the
// reference file
- if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
+ const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI);
+ if ( !faidx_has_seq(args->fai,chr_fai) )
{
- if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
- error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ static int missing_chr_fai_warned = 0;
+ if ( !args->force )
+ error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_fai,args->fa_fname,args->unify_chr_names_err);
+ else if ( !missing_chr_fai_warned++ )
+ fprintf(bcftools_stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_fai,args->fa_fname);
+ }
+
+ const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF);
+ if ( !gff_has_seq(args->gff,chr_gff) )
+ {
+ static int missing_chr_gff_warned = 0;
+ if ( !args->force )
+ error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_gff,args->gff_fname,args->unify_chr_names_err);
+ else if ( !missing_chr_gff_warned++ )
+ fprintf(bcftools_stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_gff,args->gff_fname);
}
}
if ( prev_pos > rec->pos )
- error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",chr_vcf,prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
int call_csq = 1;
if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele
@@ -3307,6 +3639,7 @@ static const char *usage(void)
"\n"
"CSQ options:\n"
" -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -C, --genetic-code INT|l Specify the genetic code table to use, 'l' to print a list [0]\n"
" -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
" -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
@@ -3319,7 +3652,8 @@ static const char *usage(void)
"GFF options:\n"
" --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
" --force Run even if some sanity checks fail\n"
- " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+ " --unify-chr-names 0|LIST Unify chromosome naming by stripping a prefix in VCF,GFF,fasta, respectively [0]\n"
+ " (e.g., \"chr,Chr,-\" trims \"chr\" in VCF and \"Chr\" in GFF, fasta is unchanged)\n"
"General options:\n"
" -e, --exclude EXPR Exclude sites for which the expression is true\n"
" -i, --include EXPR Select sites for which the expression is true\n"
@@ -3336,7 +3670,7 @@ static const char *usage(void)
" -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with worker threads [0]\n"
- " -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " -v, --verbosity INT Verbosity level 0-6 [1]\n"
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
"\n"
"Example:\n"
@@ -3358,11 +3692,11 @@ int main_csq(int argc, char *argv[])
args->verbosity = 1;
args->record_cmd_line = 1;
args->clevel = -1;
- args->unify_chr_names = 1;
static struct option loptions[] =
{
{"force",0,0,1},
+ {"genetic-code",required_argument,NULL,'C'},
{"threads",required_argument,NULL,2},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
@@ -3379,6 +3713,7 @@ int main_csq(int argc, char *argv[])
{"phase",1,0,'p'},
{"quiet",0,0,'q'},
{"verbose",1,0,'v'},
+ {"verbosity",1,0,'v'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"regions-overlap",required_argument,NULL,4},
@@ -3397,7 +3732,7 @@ int main_csq(int argc, char *argv[])
int regions_overlap = 1;
int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:C:ln:bB:v:W::",loptions,NULL)) >= 0)
{
switch (c)
{
@@ -3416,11 +3751,13 @@ int main_csq(int argc, char *argv[])
if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
break;
case 'l': args->local_csq = 1; break;
+ case 'C': args->gencode_str = optarg; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
case 'v':
args->verbosity = atoi(optarg);
- if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+ if ( args->verbosity<0 ) error("Error: expected integer with -v, --verbosity\n");
+ if ( args->verbosity > 3 ) hts_verbose = args->verbosity;
break;
case 'p':
switch (optarg[0])
@@ -3484,16 +3821,14 @@ int main_csq(int argc, char *argv[])
error("Unsupported index format '%s'\n", optarg);
break;
case 7 : args->dump_gff = optarg; break;
- case 8 :
- if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
- else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
- else error("Could not parse: --unify-chr-names %s\n",optarg);
- break;
+ case 8 : args->unify_chr_names = optarg; break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
}
}
+ init_gencode(args);
+
char *fname = NULL;
if ( optind==argc )
{
diff --git a/bcftools/filter.c b/bcftools/filter.c
index c9dcd023..2e74f0a2 100644
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -1,6 +1,6 @@
/* filter.c -- filter expressions.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -73,7 +73,7 @@ typedef struct _token_t
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int is_constant; // the threshold is set
- int hdr_id, hl_type, ht_type; // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types
+ int hdr_id, hl_type, ht_type, vl_len; // BCF header lookup ID and one of BCF_HL_*, BCF_HT_*, BCF_VL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
// -3: select indices on the fly based on values in GT
@@ -167,6 +167,7 @@ struct _filter_t
#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
#define TOK_MODULO 40 // %
#define TOK_EXT 41 // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float}
+#define TOK_FISHER 42
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s %
@@ -219,9 +220,10 @@ static int filters_next_token(char **str, int *len)
if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; }
if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
- if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
+ if ( !strncasecmp(tmp,"COUNT(",6) ) { (*str) += 5; return TOK_CNT; }
if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; }
+ if ( !strncasecmp(tmp,"FISHER(",7) ) { (*str) += 6; return -TOK_FISHER; } // compare all 7 characters, incl. the bracket, so that identifiers merely starting with "FISHER" are not taken for the function
if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; }
if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
@@ -324,10 +326,6 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
-#define FILTER_OK 0
-#define FILTER_ERR_UNKN_TAGS 1
-#define FILTER_ERR_OTHER 2
-
static void filter_add_undef_tag(filter_t *filter, char *str)
{
int i;
@@ -1191,12 +1189,9 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
{
+ tok->nvalues = tok->str_value.l = 0;
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
- if ( !fmt )
- {
- tok->nvalues = tok->str_value.l = 0;
- return;
- }
+ if ( !fmt ) return;
int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
if ( tok->str_value.m <= nvals1*nsmpl )
@@ -1276,12 +1271,10 @@ static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
+ tok->nvalues = tok->str_value.l = 0;
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
- if ( !fmt )
- {
- tok->nvalues = 0;
- return;
- }
+ if ( !fmt ) return;
+
int i, blen = 4, nsmpl = line->n_sample;
gt_length_too_big:
@@ -2036,6 +2029,154 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
}
return 1;
}
+static int func_fisher(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{   // Implements FISHER(): stores the two-sided p-value from kt_fisher_exact() (or a missing value) in rtok and returns rtok->nargs
+    int i, istack = nstack - rtok->nargs;   // stack index of the first argument
+    token_t *tok = stack[istack];
+    token_t *tok2 = istack+2==nstack ? stack[istack+1] : NULL;  // second argument, set only in the two-tag form
+    if ( !tok->nsamples )
+    {
+        // INFO tag, such as DP4
+        rtok->nvalues = 1;
+        hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
+        double *n11 = NULL, *n12 = NULL, *n21 = NULL, *n22 = NULL;
+        if ( istack+1==nstack )
+        {
+            // only one tag, expecting four values: fisher(INFO/DP4)
+            if ( tok->nvalues==4 )
+            {
+                n11 = &tok->values[0];
+                n12 = &tok->values[1];
+                n21 = &tok->values[2];
+                n22 = &tok->values[3];
+            }
+        }
+        else if ( istack+2==nstack )
+        {
+            // two tags, expecting two values in each: fisher(INFO/ADF[0,2],INFO/ADR[0,2])
+            if ( tok->nvalues>=2 && tok2->nvalues>=2 )
+            {
+                n11 = &tok->values[0];
+                n21 = &tok->values[1];
+                n12 = &tok2->values[0];
+                n22 = &tok2->values[1];
+            }
+        }
+        if ( !n11 || !n12 || !n21 || !n22   // the pointers stay NULL when the number of values did not match
+                || bcf_double_is_missing_or_vector_end(n11[0])
+                || bcf_double_is_missing_or_vector_end(n12[0])
+                || bcf_double_is_missing_or_vector_end(n21[0])
+                || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[0]);
+        else
+        {
+            double left,right,two;
+            kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+            rtok->values[0] = two;
+        }
+    }
+    else
+    {
+        rtok->nval1 = 1;    // FORMAT tag: one result value per sample
+        rtok->nvalues = tok->nsamples;
+        rtok->nsamples = tok->nsamples;
+        hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
+        assert(tok->usmpl);
+        if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+        if ( istack+1==nstack && tok->nval1==4 )
+        {
+            // only one tag, expecting four values: fisher(FORMAT/DP4)
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                double *n11 = tok->values + tok->nval1*i;
+                double *n12 = tok->values + tok->nval1*i+1;
+                double *n21 = tok->values + tok->nval1*i+2;
+                double *n22 = tok->values + tok->nval1*i+3;
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]);
+                else
+                {
+                    double left,right,two;
+                    kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                    rtok->values[i] = two;
+                }
+            }
+        }
+        else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R && tok->nuidxs+tok2->nuidxs==4 )
+        {
+            // two Number=R tags with explicit indices, e.g. fisher(FORMAT/ADF[:0,1],FORMAT/ADR[:0,1])
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                double *n11 = tok->values + tok->nval1*i;
+                double *n12 = tok->values + tok->nval1*i+1;
+                double *n21 = tok2->values + tok2->nval1*i;
+                double *n22 = tok2->values + tok2->nval1*i+1;
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]);
+                else
+                {
+                    double left,right,two;
+                    kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                    rtok->values[i] = two;
+                }
+            }
+        }
+        else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R )
+        {
+            // two Number=R tags, fisher(FORMAT/ADF,FORMAT/ADR), take the two allele indexes from GT
+            int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi);
+            int max_ploidy = ngt/line->n_sample;
+            if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set
+            {
+                for (i=0; i<rtok->nsamples; i++)
+                    if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]);
+                return rtok->nargs;
+            }
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                int32_t *ptr = flt->tmpi + i*max_ploidy;
+                if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end )
+                {
+                    bcf_double_set_missing(rtok->values[i]);
+                    continue;
+                }
+                int idx1 = bcf_gt_allele(ptr[0]);
+                int idx2 = bcf_gt_allele(ptr[1]);
+                if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
+                if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
+                double *vals = tok->values + tok->nval1*i;
+                double *vals2 = tok2->values + tok2->nval1*i;
+                double *n11 = &vals[idx1];
+                double *n12 = &vals[idx2];
+                double *n21 = &vals2[idx1];
+                double *n22 = &vals2[idx2];
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) )
+                {
+                    bcf_double_set_missing(rtok->values[i]);
+                    continue;
+                }
+                double left,right,two;
+                kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                rtok->values[i] = two;
+            }
+        }
+    }
+    return rtok->nargs;
+}
static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
int i, istack = nstack - rtok->nargs;
@@ -2181,7 +2322,6 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
rtok->nsamples = tok->nsamples;
rtok->nval1 = tok->nval1;
memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
- assert(tok->usmpl);
if ( !rtok->usmpl )
{
rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
@@ -2618,9 +2758,11 @@ static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logi
char *mid = str;
while ( mid < end && *mid && *mid!=',' ) mid++;
int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
- if ( miss && missing_logic[miss] ) return 1;
+ int match = ( miss && missing_logic[miss] ) ? 1 : 0;
+ if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+ if ( match ) return 1;
char tmp = *mid; *mid = 0;
- int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+ match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
*mid = tmp;
if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
if ( match ) return 1;
@@ -2707,6 +2849,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
{
token_t *tok = atok->regex ? btok : atok;
rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+ // (no debugging output here: printing pass_site and the tested string for every comparison would pollute stderr)
}
return;
}
@@ -2955,6 +3098,7 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr)
}
static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
{
+ tok->vl_len = BCF_VL_FIXED;
tok->hl_type = -1;
tok->ht_type = -1;
tok->tok_type = TOK_VAL;
@@ -2971,6 +3115,7 @@ static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
+ tok->vl_len = BCF_VL_FIXED;
tok->ht_type = -1;
tok->hl_type = -1;
tok->tok_type = TOK_VAL;
@@ -3168,6 +3313,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+ if ( tok->hdr_id >= 0 ) tok->vl_len = bcf_hdr_id2length(filter->hdr,tok->hl_type,tok->hdr_id);
if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
if ( tok->hdr_id>=0 )
{
@@ -3502,6 +3648,49 @@ static void determine_ext_types(filter_t *filter, int ntok, token_t *tok)
}
}
+// Same as hts_readlist but recognizes brackets, () and [], and does not split by comma inside. On success returns an array of *_n heap-allocated strings; on allocation failure returns NULL with *_n left at 0.
+// Possible todo: the escaping is simplistic, does not check the syntax, so "([)]" is not distinguishable from "([])". NOTE(review): presumably the caller frees each string and the array - confirm at the call site.
+char **parse_tag_list(const char *string, int *_n)
+{
+ *_n = 0;
+ unsigned int m = 0, n = 0; // m: allocated slots in s, n: number of items stored so far
+ char **s = 0, **s_new;
+ const char *q = string, *p = string; // q: start of the current item, p: scanning position
+ int escape_bracket = 0; // current bracket nesting depth; a comma separates items only at depth 0
+ while ( 1 )
+ {
+ if ((*p == ',' && !escape_bracket) || *p == 0) // end of an item: comma outside of brackets or end of the string
+ {
+ if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
+ goto err;
+ s[n] = (char*)calloc(p - q + 1, 1); // zero-filled, with one extra byte for the terminating NUL
+ if (!s[n])
+ goto err;
+ strncpy(s[n++], q, p - q); // copies the item only; the zeroed extra byte keeps it NUL-terminated
+ q = p + 1; // the next item starts right after the separator
+ }
+ if ( !*p ) break;
+ if ( *p=='[' || *p=='(' ) escape_bracket++;
+ if ( (*p==']' || *p==')') && escape_bracket ) escape_bracket--; // the depth never drops below zero on unbalanced input
+ p++;
+ }
+
+ // Try to shrink s to the minimum size needed
+ s_new = (char**)realloc(s, n * sizeof(char*));
+ if (!s_new)
+ goto err;
+
+ s = s_new;
+ assert(n < INT_MAX); // hts_resize() should ensure this
+ *_n = n;
+ return s;
+
+err:
+ for (m = 0; m < n; m++) // release the items copied so far, then the array itself
+ free(s[m]);
+ free(s);
+ return NULL;
+}
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
@@ -3573,7 +3762,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
}
else if ( ret == -TOK_FUNC )
{
- // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the
+ // this is different from TOK_PERLSUB,TOK_BINOM,TOK_FISHER in that the expression inside the
// brackets gets evaluated as normal expression
nops++;
hts_expand0(token_t, nops, mops, ops);
@@ -3597,7 +3786,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
else error("The function \"%s\" is not supported\n", tmp-len);
continue;
}
- else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM
+ else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM,TOK_FISHER
{
ret = -ret;
@@ -3609,7 +3798,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
if ( ret == TOK_PERLSUB )
{
while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
- if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
+ if ( *beg!='(' ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str);
// the subroutine name
kputc('"', &rmme);
@@ -3622,12 +3811,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
}
char *end = beg;
while ( *end && *end!=')' ) end++;
- if ( !*end ) error("Could not parse the expression: %s\n", str);
+ if ( !*end ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str);
// subroutine arguments
rmme.l = 0;
kputsn(beg+1, end-beg-1, &rmme);
- char **rmme_list = hts_readlist(rmme.s, 0, &margs);
+ char **rmme_list = parse_tag_list(rmme.s, &margs);
for (i=0; iexit_on_error )
+ error("[%s:%d %s] Error: could not parse the expression \"%s\"\n", __FILE__,__LINE__,__FUNCTION__,filter->str);
+ filter->status |= FILTER_ERR_OTHER;
+ }
if ( filter->status != FILTER_OK )
{
if ( mops ) free(ops);
@@ -3745,6 +3940,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
if ( type==BCF_HT_INT ) set_missing = 1;
else if ( type==BCF_HT_REAL ) set_missing = 1;
}
+ else if ( !out[k].tag ) error("Error: could not parse the expression\n"); // e.g. =~
else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
@@ -3890,6 +4086,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_FISHER ) { out[i].func = func_fisher; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
index 2db56801..e3d14aff 100644
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -2,7 +2,7 @@
/* filter.c -- filter expressions.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -75,7 +75,7 @@ typedef struct _token_t
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int is_constant; // the threshold is set
- int hdr_id, hl_type, ht_type; // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types
+ int hdr_id, hl_type, ht_type, vl_len; // BCF header lookup ID and one of BCF_HL_*, BCF_HT_*, BCF_VL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
// -3: select indices on the fly based on values in GT
@@ -169,6 +169,7 @@ struct _filter_t
#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
#define TOK_MODULO 40 // %
#define TOK_EXT 41 // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float}
+#define TOK_FISHER 42
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s %
@@ -221,9 +222,10 @@ static int filters_next_token(char **str, int *len)
if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; }
if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
- if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
+ if ( !strncasecmp(tmp,"COUNT(",6) ) { (*str) += 5; return TOK_CNT; }
if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; }
+ if ( !strncasecmp(tmp,"FISHER(",7) ) { (*str) += 6; return -TOK_FISHER; } // compare all 7 characters, incl. the bracket, so that identifiers merely starting with "FISHER" are not taken for the function
if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; }
if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
@@ -326,10 +328,6 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
-#define FILTER_OK 0
-#define FILTER_ERR_UNKN_TAGS 1
-#define FILTER_ERR_OTHER 2
-
static void filter_add_undef_tag(filter_t *filter, char *str)
{
int i;
@@ -1193,12 +1191,9 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
}
static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
{
+ tok->nvalues = tok->str_value.l = 0;
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
- if ( !fmt )
- {
- tok->nvalues = tok->str_value.l = 0;
- return;
- }
+ if ( !fmt ) return;
int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
if ( tok->str_value.m <= nvals1*nsmpl )
@@ -1278,12 +1273,10 @@ static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
+ tok->nvalues = tok->str_value.l = 0;
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
- if ( !fmt )
- {
- tok->nvalues = 0;
- return;
- }
+ if ( !fmt ) return;
+
int i, blen = 4, nsmpl = line->n_sample;
gt_length_too_big:
@@ -2038,6 +2031,154 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
}
return 1;
}
+static int func_fisher(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{   // Implements FISHER(): stores the two-sided p-value from kt_fisher_exact() (or a missing value) in rtok and returns rtok->nargs
+    int i, istack = nstack - rtok->nargs;   // stack index of the first argument
+    token_t *tok = stack[istack];
+    token_t *tok2 = istack+2==nstack ? stack[istack+1] : NULL;  // second argument, set only in the two-tag form
+    if ( !tok->nsamples )
+    {
+        // INFO tag, such as DP4
+        rtok->nvalues = 1;
+        hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
+        double *n11 = NULL, *n12 = NULL, *n21 = NULL, *n22 = NULL;
+        if ( istack+1==nstack )
+        {
+            // only one tag, expecting four values: fisher(INFO/DP4)
+            if ( tok->nvalues==4 )
+            {
+                n11 = &tok->values[0];
+                n12 = &tok->values[1];
+                n21 = &tok->values[2];
+                n22 = &tok->values[3];
+            }
+        }
+        else if ( istack+2==nstack )
+        {
+            // two tags, expecting two values in each: fisher(INFO/ADF[0,2],INFO/ADR[0,2])
+            if ( tok->nvalues>=2 && tok2->nvalues>=2 )
+            {
+                n11 = &tok->values[0];
+                n21 = &tok->values[1];
+                n12 = &tok2->values[0];
+                n22 = &tok2->values[1];
+            }
+        }
+        if ( !n11 || !n12 || !n21 || !n22   // the pointers stay NULL when the number of values did not match
+                || bcf_double_is_missing_or_vector_end(n11[0])
+                || bcf_double_is_missing_or_vector_end(n12[0])
+                || bcf_double_is_missing_or_vector_end(n21[0])
+                || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[0]);
+        else
+        {
+            double left,right,two;
+            kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+            rtok->values[0] = two;
+        }
+    }
+    else
+    {
+        rtok->nval1 = 1;    // FORMAT tag: one result value per sample
+        rtok->nvalues = tok->nsamples;
+        rtok->nsamples = tok->nsamples;
+        hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
+        assert(tok->usmpl);
+        if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+        if ( istack+1==nstack && tok->nval1==4 )
+        {
+            // only one tag, expecting four values: fisher(FORMAT/DP4)
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                double *n11 = tok->values + tok->nval1*i;
+                double *n12 = tok->values + tok->nval1*i+1;
+                double *n21 = tok->values + tok->nval1*i+2;
+                double *n22 = tok->values + tok->nval1*i+3;
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]);
+                else
+                {
+                    double left,right,two;
+                    kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                    rtok->values[i] = two;
+                }
+            }
+        }
+        else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R && tok->nuidxs+tok2->nuidxs==4 )
+        {
+            // two Number=R tags with explicit indices, e.g. fisher(FORMAT/ADF[:0,1],FORMAT/ADR[:0,1])
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                double *n11 = tok->values + tok->nval1*i;
+                double *n12 = tok->values + tok->nval1*i+1;
+                double *n21 = tok2->values + tok2->nval1*i;
+                double *n22 = tok2->values + tok2->nval1*i+1;
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]);
+                else
+                {
+                    double left,right,two;
+                    kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                    rtok->values[i] = two;
+                }
+            }
+        }
+        else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R )
+        {
+            // two Number=R tags, fisher(FORMAT/ADF,FORMAT/ADR), take the two allele indexes from GT
+            int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi);
+            int max_ploidy = ngt/line->n_sample;
+            if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set
+            {
+                for (i=0; i<rtok->nsamples; i++)
+                    if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]);
+                return rtok->nargs;
+            }
+            for (i=0; i<rtok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                int32_t *ptr = flt->tmpi + i*max_ploidy;
+                if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end )
+                {
+                    bcf_double_set_missing(rtok->values[i]);
+                    continue;
+                }
+                int idx1 = bcf_gt_allele(ptr[0]);
+                int idx2 = bcf_gt_allele(ptr[1]);
+                if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
+                if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]);
+                double *vals = tok->values + tok->nval1*i;
+                double *vals2 = tok2->values + tok2->nval1*i;
+                double *n11 = &vals[idx1];
+                double *n12 = &vals[idx2];
+                double *n21 = &vals2[idx1];
+                double *n22 = &vals2[idx2];
+                if ( !n11 || !n12 || !n21 || !n22
+                        || bcf_double_is_missing_or_vector_end(n11[0])
+                        || bcf_double_is_missing_or_vector_end(n12[0])
+                        || bcf_double_is_missing_or_vector_end(n21[0])
+                        || bcf_double_is_missing_or_vector_end(n22[0]) )
+                {
+                    bcf_double_set_missing(rtok->values[i]);
+                    continue;
+                }
+                double left,right,two;
+                kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two);
+                rtok->values[i] = two;
+            }
+        }
+    }
+    return rtok->nargs;
+}
static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
int i, istack = nstack - rtok->nargs;
@@ -2183,7 +2324,6 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
rtok->nsamples = tok->nsamples;
rtok->nval1 = tok->nval1;
memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
- assert(tok->usmpl);
if ( !rtok->usmpl )
{
rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
@@ -2620,9 +2760,11 @@ static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logi
char *mid = str;
while ( mid < end && *mid && *mid!=',' ) mid++;
int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
- if ( miss && missing_logic[miss] ) return 1;
+ int match = ( miss && missing_logic[miss] ) ? 1 : 0;
+ if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+ if ( match ) return 1;
char tmp = *mid; *mid = 0;
- int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+ match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
*mid = tmp;
if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
if ( match ) return 1;
@@ -2709,6 +2851,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
{
token_t *tok = atok->regex ? btok : atok;
rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+ fprintf(bcftools_stderr,"pass=%d [%s]\n",rtok->pass_site,tok->str_value.s);
}
return;
}
@@ -2957,6 +3100,7 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr)
}
static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
{
+ tok->vl_len = BCF_VL_FIXED;
tok->hl_type = -1;
tok->ht_type = -1;
tok->tok_type = TOK_VAL;
@@ -2973,6 +3117,7 @@ static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
+ tok->vl_len = BCF_VL_FIXED;
tok->ht_type = -1;
tok->hl_type = -1;
tok->tok_type = TOK_VAL;
@@ -3170,6 +3315,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+ if ( tok->hdr_id >= 0 ) tok->vl_len = bcf_hdr_id2length(filter->hdr,tok->hl_type,tok->hdr_id);
if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
if ( tok->hdr_id>=0 )
{
@@ -3504,6 +3650,49 @@ static void determine_ext_types(filter_t *filter, int ntok, token_t *tok)
}
}
+// Same as hts_readlist but recognizes brackets, () and [], and does not split by comma inside
+// Possible todo: the escaping is simplistic, does not check the syntax, so "([)]" is not distinguishable from "([])"
+char **parse_tag_list(const char *string, int *_n)
+{
+ *_n = 0;
+ unsigned int m = 0, n = 0;
+ char **s = 0, **s_new;
+ const char *q = string, *p = string;
+ int escape_bracket = 0;
+ while ( 1 )
+ {
+ if ((*p == ',' && !escape_bracket) || *p == 0)
+ {
+ if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
+ goto err;
+ s[n] = (char*)calloc(p - q + 1, 1);
+ if (!s[n])
+ goto err;
+ strncpy(s[n++], q, p - q);
+ q = p + 1;
+ }
+ if ( !*p ) break;
+ if ( *p=='[' || *p=='(' ) escape_bracket++;
+ if ( (*p==']' || *p==')') && escape_bracket ) escape_bracket--;
+ p++;
+ }
+
+ // Try to shrink s to the minimum size needed
+ s_new = (char**)realloc(s, n * sizeof(char*));
+ if (!s_new)
+ goto err;
+
+ s = s_new;
+ assert(n < INT_MAX); // hts_resize() should ensure this
+ *_n = n;
+ return s;
+
+err:
+ for (m = 0; m < n; m++)
+ free(s[m]);
+ free(s);
+ return NULL;
+}
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
@@ -3575,7 +3764,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
}
else if ( ret == -TOK_FUNC )
{
- // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the
+ // this is different from TOK_PERLSUB,TOK_BINOM,TOK_FISHER in that the expression inside the
// brackets gets evaluated as normal expression
nops++;
hts_expand0(token_t, nops, mops, ops);
@@ -3599,7 +3788,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
else error("The function \"%s\" is not supported\n", tmp-len);
continue;
}
- else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM
+ else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM,TOK_FISHER
{
ret = -ret;
@@ -3611,7 +3800,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
if ( ret == TOK_PERLSUB )
{
while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
- if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
+ if ( *beg!='(' ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str);
// the subroutine name
kputc('"', &rmme);
@@ -3624,12 +3813,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
}
char *end = beg;
while ( *end && *end!=')' ) end++;
- if ( !*end ) error("Could not parse the expression: %s\n", str);
+ if ( !*end ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str);
// subroutine arguments
rmme.l = 0;
kputsn(beg+1, end-beg-1, &rmme);
- char **rmme_list = hts_readlist(rmme.s, 0, &margs);
+ char **rmme_list = parse_tag_list(rmme.s, &margs);
for (i=0; iexit_on_error )
+ error("[%s:%d %s] Error: could not parse the expression \"%s\"\n", __FILE__,__LINE__,__FUNCTION__,filter->str);
+ filter->status |= FILTER_ERR_OTHER;
+ }
if ( filter->status != FILTER_OK )
{
if ( mops ) free(ops);
@@ -3747,6 +3942,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
if ( type==BCF_HT_INT ) set_missing = 1;
else if ( type==BCF_HT_REAL ) set_missing = 1;
}
+ else if ( !out[k].tag ) error("Error: could not parse the expression\n"); // e.g. =~
else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
@@ -3892,6 +4088,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_FISHER ) { out[i].func = func_fisher; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
diff --git a/bcftools/gff.c b/bcftools/gff.c
index 283ced33..119a6912 100644
--- a/bcftools/gff.c
+++ b/bcftools/gff.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2023 Genome Research Ltd.
+ Copyright (c) 2023-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -87,11 +87,6 @@ typedef struct
// mapping from transcript id to tscript, for quick CDS anchoring
kh_int2tscript_t *id2tr;
- // sequences
- void *seq2int; // str2int hash
- char **seq;
- int nseq, mseq;
-
// ignored biotypes
void *ignored_biotypes;
@@ -111,18 +106,25 @@ struct gff_t_
// index iterator
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ // str2int hash with parsed sequence names
+ void *seq2int;
+
// temporary structures, deleted after initializtion
aux_t init;
+ // sequences
+ char **seq;
+ int nseq, mseq;
+
// mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
id_tbl_t tscript_ids;
- int strip_chr_names, verbosity;
+ int verbosity;
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
struct {
int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
- int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ftr_out_of_bounds;
} warned;
};
@@ -158,12 +160,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...)
va_end(args);
return 0;
- case strip_chr_names:
- va_start(args, key);
- gff->strip_chr_names = va_arg(args,int);
- va_end(args);
- return 0;
-
case verbosity:
va_start(args, key);
gff->verbosity = va_arg(args,int);
@@ -212,18 +208,17 @@ const char *gf_type2gff_string(int type)
*/
static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
{
- aux_t *aux = &gff->init;
char tmp = chr_end[1];
chr_end[1] = 0;
int iseq;
- if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 )
{
char *new_chr = strdup(chr_beg);
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = new_chr;
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
+ hts_expand(char*, gff->nseq+1, gff->mseq, gff->seq);
+ gff->seq[gff->nseq] = new_chr;
+ iseq = khash_str2int_inc(gff->seq2int, gff->seq[gff->nseq]);
+ gff->nseq++;
+ assert( gff->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
}
chr_end[1] = tmp;
return iseq;
@@ -239,7 +234,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c
char *se = (char*) line;
while ( *se && *se!='\t' ) se++;
if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3;
*chr_beg = (char*) line;
*chr_end = se-1;
}
@@ -633,9 +627,9 @@ static int cmp_cds_ptr(const void *a, const void *b)
return 0;
}
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+static inline void chr_beg_end(gff_t *gff, int iseq, char **chr_beg, char **chr_end)
{
- *chr_beg = *chr_end = aux->seq[iseq];
+ *chr_beg = *chr_end = gff->seq[iseq];
while ( (*chr_end)[1] ) (*chr_end)++;
}
static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
@@ -674,7 +668,7 @@ static void register_utr(gff_t *gff, ftr_t *ftr)
utr->tr = tscript_init(aux, ftr->trid);
char *chr_beg, *chr_end;
- chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, utr->tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
}
static void register_exon(gff_t *gff, ftr_t *ftr)
@@ -686,7 +680,7 @@ static void register_exon(gff_t *gff, ftr_t *ftr)
exon->tr = tscript_init(aux, ftr->trid);
char *chr_beg, *chr_end;
- chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, exon->tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
}
@@ -703,7 +697,7 @@ static void tscript_init_cds(gff_t *gff)
// position-to-tscript lookup
char *chr_beg, *chr_end;
- chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
if ( !tr->ncds ) continue; // transcript with no CDS
@@ -914,7 +908,7 @@ static int gff_dump(gff_t *gff, const char *fname)
gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
char *gene_id = gff->init.gene_ids.str[gene->id];
str.l = 0;
- ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
+ ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
}
@@ -974,7 +968,7 @@ int gff_parse(gff_t *gff)
if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
aux_t *aux = &gff->init;
- aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ gff->seq2int = khash_str2int_init(); // chrom's numeric id
aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
@@ -1010,7 +1004,16 @@ int gff_parse(gff_t *gff)
khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype
+ // check whether the feature respects transcript's beg,end coordinates
gf_tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( ftr->beg < tr->beg || ftr->end > tr->end )
+ {
+ if ( !gff->warned.ftr_out_of_bounds || gff->verbosity > 1 )
+ fprintf(stderr,"Warning: The GFF contains features outside the transcript boundaries .. %s\n",gff_id2string(gff,transcript,tr->id));
+ gff->warned.ftr_out_of_bounds++;
+ if ( ftr->beg < tr->beg ) tr->beg = ftr->beg;
+ if ( ftr->end > tr->end ) tr->end = ftr->end;
+ }
tr->used = 1;
tr->gene->used = 1;
@@ -1022,7 +1025,7 @@ int gff_parse(gff_t *gff)
else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
else
- error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+ error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", gff->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
}
tscript_init_cds(gff);
@@ -1074,9 +1077,7 @@ int gff_parse(gff_t *gff)
" or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
" of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
- free(aux->seq);
free(aux->ftr);
- khash_str2int_destroy_free(aux->seq2int);
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
kh_destroy(int2tscript,aux->id2tr);
gff_id_destroy(&aux->gene_ids);
@@ -1110,7 +1111,20 @@ void gff_destroy(gff_t *gff)
regidx_destroy(gff->idx_exon);
regidx_destroy(gff->idx_tscript);
+ khash_str2int_destroy_free(gff->seq2int);
gff_id_destroy(&gff->tscript_ids);
+ free(gff->seq);
free(gff);
}
-
+int gff_has_seq(gff_t *gff, const char *seq)
+{
+ return khash_str2int_has_key(gff->seq2int, seq);
+}
+int gff_nseq(gff_t *gff)
+{
+ return gff->nseq;
+}
+const char *gff_iseq(gff_t *gff, int i)
+{
+ return i>=0 && i<gff->nseq ? gff->seq[i] : NULL;
+}
diff --git a/bcftools/gff.c.pysam.c b/bcftools/gff.c.pysam.c
index 3722f606..03aea831 100644
--- a/bcftools/gff.c.pysam.c
+++ b/bcftools/gff.c.pysam.c
@@ -2,7 +2,7 @@
/* The MIT License
- Copyright (c) 2023 Genome Research Ltd.
+ Copyright (c) 2023-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -89,11 +89,6 @@ typedef struct
// mapping from transcript id to tscript, for quick CDS anchoring
kh_int2tscript_t *id2tr;
- // sequences
- void *seq2int; // str2int hash
- char **seq;
- int nseq, mseq;
-
// ignored biotypes
void *ignored_biotypes;
@@ -113,18 +108,25 @@ struct gff_t_
// index iterator
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ // str2int hash with parsed sequence names
+ void *seq2int;
+
// temporary structures, deleted after initializtion
aux_t init;
+ // sequences
+ char **seq;
+ int nseq, mseq;
+
// mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
id_tbl_t tscript_ids;
- int strip_chr_names, verbosity;
+ int verbosity;
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
struct {
int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
- int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ftr_out_of_bounds;
} warned;
};
@@ -160,12 +162,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...)
va_end(args);
return 0;
- case strip_chr_names:
- va_start(args, key);
- gff->strip_chr_names = va_arg(args,int);
- va_end(args);
- return 0;
-
case verbosity:
va_start(args, key);
gff->verbosity = va_arg(args,int);
@@ -214,18 +210,17 @@ const char *gf_type2gff_string(int type)
*/
static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
{
- aux_t *aux = &gff->init;
char tmp = chr_end[1];
chr_end[1] = 0;
int iseq;
- if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 )
{
char *new_chr = strdup(chr_beg);
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = new_chr;
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
+ hts_expand(char*, gff->nseq+1, gff->mseq, gff->seq);
+ gff->seq[gff->nseq] = new_chr;
+ iseq = khash_str2int_inc(gff->seq2int, gff->seq[gff->nseq]);
+ gff->nseq++;
+ assert( gff->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
}
chr_end[1] = tmp;
return iseq;
@@ -241,7 +236,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c
char *se = (char*) line;
while ( *se && *se!='\t' ) se++;
if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3;
*chr_beg = (char*) line;
*chr_end = se-1;
}
@@ -635,9 +629,9 @@ static int cmp_cds_ptr(const void *a, const void *b)
return 0;
}
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+static inline void chr_beg_end(gff_t *gff, int iseq, char **chr_beg, char **chr_end)
{
- *chr_beg = *chr_end = aux->seq[iseq];
+ *chr_beg = *chr_end = gff->seq[iseq];
while ( (*chr_end)[1] ) (*chr_end)++;
}
static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
@@ -676,7 +670,7 @@ static void register_utr(gff_t *gff, ftr_t *ftr)
utr->tr = tscript_init(aux, ftr->trid);
char *chr_beg, *chr_end;
- chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, utr->tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
}
static void register_exon(gff_t *gff, ftr_t *ftr)
@@ -688,7 +682,7 @@ static void register_exon(gff_t *gff, ftr_t *ftr)
exon->tr = tscript_init(aux, ftr->trid);
char *chr_beg, *chr_end;
- chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, exon->tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
}
@@ -705,7 +699,7 @@ static void tscript_init_cds(gff_t *gff)
// position-to-tscript lookup
char *chr_beg, *chr_end;
- chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ chr_beg_end(gff, tr->gene->iseq, &chr_beg, &chr_end);
regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
if ( !tr->ncds ) continue; // transcript with no CDS
@@ -916,7 +910,7 @@ static int gff_dump(gff_t *gff, const char *fname)
gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
char *gene_id = gff->init.gene_ids.str[gene->id];
str.l = 0;
- ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
+ ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
}
@@ -976,7 +970,7 @@ int gff_parse(gff_t *gff)
if ( gff->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", gff->fname);
aux_t *aux = &gff->init;
- aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ gff->seq2int = khash_str2int_init(); // chrom's numeric id
aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
@@ -1012,7 +1006,16 @@ int gff_parse(gff_t *gff)
khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype
+ // check whether the feature respects transcript's beg,end coordinates
gf_tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( ftr->beg < tr->beg || ftr->end > tr->end )
+ {
+ if ( !gff->warned.ftr_out_of_bounds || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: The GFF contains features outside the transcript boundaries .. %s\n",gff_id2string(gff,transcript,tr->id));
+ gff->warned.ftr_out_of_bounds++;
+ if ( ftr->beg < tr->beg ) tr->beg = ftr->beg;
+ if ( ftr->end > tr->end ) tr->end = ftr->end;
+ }
tr->used = 1;
tr->gene->used = 1;
@@ -1024,7 +1027,7 @@ int gff_parse(gff_t *gff)
else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
else
- error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+ error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", gff->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
}
tscript_init_cds(gff);
@@ -1076,9 +1079,7 @@ int gff_parse(gff_t *gff)
" or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
" of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
- free(aux->seq);
free(aux->ftr);
- khash_str2int_destroy_free(aux->seq2int);
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
kh_destroy(int2tscript,aux->id2tr);
gff_id_destroy(&aux->gene_ids);
@@ -1112,7 +1113,20 @@ void gff_destroy(gff_t *gff)
regidx_destroy(gff->idx_exon);
regidx_destroy(gff->idx_tscript);
+ khash_str2int_destroy_free(gff->seq2int);
gff_id_destroy(&gff->tscript_ids);
+ free(gff->seq);
free(gff);
}
-
+int gff_has_seq(gff_t *gff, const char *seq)
+{
+ return khash_str2int_has_key(gff->seq2int, seq);
+}
+int gff_nseq(gff_t *gff)
+{
+ return gff->nseq;
+}
+const char *gff_iseq(gff_t *gff, int i)
+{
+ return i>=0 && i<gff->nseq ? gff->seq[i] : NULL;
+}
diff --git a/bcftools/gff.h b/bcftools/gff.h
index afa945e8..ddde687d 100644
--- a/bcftools/gff.h
+++ b/bcftools/gff.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2023-2024 Genome Research Ltd.
+ Copyright (c) 2023-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -289,7 +289,6 @@ typedef enum
{
// write options
verbosity, // int, 0-2
- strip_chr_names, // int, 0 to leave as is, 1 to strip 'chr' prefix
force_out_of_phase, // int, 1 to proceed even CDS exon out of expected phase
dump_fname, // const char*, dump the parsed GFF into this file, for debugging purposes
@@ -314,4 +313,9 @@ void *gff_get(gff_t *gff, gff_opt_t key);
const char *gff_id2string(gff_t *gff, id_type_t type, int id);
const char *gf_type2gff_string(int type);
+int gff_has_seq(gff_t *gff, const char *chr);
+int gff_nseq(gff_t *gff);
+const char *gff_iseq(gff_t *gff, int i);
+
+
#endif
diff --git a/bcftools/main.c b/bcftools/main.c
index 14357373..6de53680 100644
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -265,7 +265,7 @@ int main(int argc, char *argv[])
if (argc < 2) { usage(stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2025 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
index 56174fa2..9f720bae 100644
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -267,7 +267,7 @@ int bcftools_main(int argc, char *argv[])
if (argc < 2) { usage(bcftools_stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2025 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c
index 943e0f6f..f87048ea 100644
--- a/bcftools/mpileup.c
+++ b/bcftools/mpileup.c
@@ -1,6 +1,6 @@
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2024 Genome Research Ltd.
+ Copyright (C) 2008-2025 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li
@@ -651,6 +651,7 @@ static int mpileup(mplp_conf_t *conf)
exit(EXIT_FAILURE);
}
}
+ regidx_set(conf->reg,merge_overlaps,1);
nregs = regidx_nregs(conf->reg);
if ( nregs )
{
@@ -766,20 +767,20 @@ static int mpileup(mplp_conf_t *conf)
if (conf->record_cmd_line)
{
ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
conf->buf.l = 0;
ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
kputc('\n', &conf->buf);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
if (conf->fai_fname)
{
conf->buf.l = 0;
ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
// Translate BAM @SQ tags to BCF ##contig tags
@@ -788,7 +789,7 @@ static int mpileup(mplp_conf_t *conf)
{
conf->buf.l = 0;
ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>\n", hdr->target_name[i], hdr->target_len[i]);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
conf->buf.l = 0;
@@ -1269,6 +1270,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
+ " -v, --verbosity INT Verbosity level\n"
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
@@ -1464,10 +1466,14 @@ int main_mpileup(int argc, char *argv[])
{"no-poly-mqual", no_argument, NULL, 26},
{"score-vs-ref",required_argument, NULL, 27},
{"seqq-offset", required_argument, NULL, 28},
+ {"verbosity",required_argument,NULL,'v'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::v:",lopts,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 16 :
mplp.rflag_skip_any_unset = bam_str2flag(optarg);
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c
index 4458b60f..099efcd0 100644
--- a/bcftools/mpileup.c.pysam.c
+++ b/bcftools/mpileup.c.pysam.c
@@ -2,7 +2,7 @@
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2024 Genome Research Ltd.
+ Copyright (C) 2008-2025 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li
@@ -653,6 +653,7 @@ static int mpileup(mplp_conf_t *conf)
bcftools_exit(EXIT_FAILURE);
}
}
+ regidx_set(conf->reg,merge_overlaps,1);
nregs = regidx_nregs(conf->reg);
if ( nregs )
{
@@ -768,20 +769,20 @@ static int mpileup(mplp_conf_t *conf)
if (conf->record_cmd_line)
{
ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
conf->buf.l = 0;
ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
kputc('\n', &conf->buf);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
if (conf->fai_fname)
{
conf->buf.l = 0;
ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
// Translate BAM @SQ tags to BCF ##contig tags
@@ -790,7 +791,7 @@ static int mpileup(mplp_conf_t *conf)
{
conf->buf.l = 0;
ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>\n", hdr->target_name[i], hdr->target_len[i]);
- bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
conf->buf.l = 0;
@@ -1271,6 +1272,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
+ " -v, --verbosity INT Verbosity level\n"
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
@@ -1466,10 +1468,14 @@ int main_mpileup(int argc, char *argv[])
{"no-poly-mqual", no_argument, NULL, 26},
{"score-vs-ref",required_argument, NULL, 27},
{"seqq-offset", required_argument, NULL, 28},
+ {"verbosity",required_argument,NULL,'v'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::v:",lopts,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 16 :
mplp.rflag_skip_any_unset = bam_str2flag(optarg);
diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c
index 550ba876..d0884dcc 100644
--- a/bcftools/ploidy.c
+++ b/bcftools/ploidy.c
@@ -1,5 +1,5 @@
-/*
- Copyright (C) 2014-2016 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -9,10 +9,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -58,7 +58,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg
ploidy_t *ploidy = (ploidy_t*) usr;
void *sex2id = ploidy->sex2id;
- // Check for special case of default ploidy "* * * "
+ // Check for special case of default ploidy "* * * SEX PLOIDY"
int default_ploidy_def = 0;
char *ss = (char*) line;
@@ -112,7 +112,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg
// Special case, chr="*" stands for a default value
if ( default_ploidy_def )
{
- ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
+ ploidy->sex2dflt[sp->sex] = sp->ploidy;
return -1;
}
@@ -212,7 +212,7 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min
{
int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
- if ( pld!=ploidy->dflt )
+ if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
@@ -266,3 +266,23 @@ int ploidy_min(ploidy_t *ploidy)
return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
}
+char *ploidy_format(ploidy_t *ploidy)
+{
+ kstring_t str = {0,0,0};
+
+ regitr_t *itr = regitr_init(ploidy->idx);
+ while ( regitr_loop(itr) )
+ {
+ int id = regitr_payload(itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(itr,sex_ploidy_t).ploidy;
+ ksprintf(&str,"%s\t%d\t%d\t%s\t%d\n", itr->seq, itr->beg+1, itr->end+1, ploidy->id2sex[id],pld);
+ }
+ regitr_destroy(itr);
+
+ int i;
+ for (i=0; i < ploidy->nsex; i++)
+ ksprintf(&str,"*\t*\t*\t%s\t%d\n", ploidy->id2sex[i],ploidy->sex2dflt[i]);
+
+ return str.s;
+}
+
diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c
index aee0c567..02b34be1 100644
--- a/bcftools/ploidy.c.pysam.c
+++ b/bcftools/ploidy.c.pysam.c
@@ -1,7 +1,7 @@
#include "bcftools.pysam.h"
-/*
- Copyright (C) 2014-2016 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -11,10 +11,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -60,7 +60,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg
ploidy_t *ploidy = (ploidy_t*) usr;
void *sex2id = ploidy->sex2id;
- // Check for special case of default ploidy "* * * "
+ // Check for special case of default ploidy "* * * SEX PLOIDY"
int default_ploidy_def = 0;
char *ss = (char*) line;
@@ -114,7 +114,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg
// Special case, chr="*" stands for a default value
if ( default_ploidy_def )
{
- ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
+ ploidy->sex2dflt[sp->sex] = sp->ploidy;
return -1;
}
@@ -214,7 +214,7 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min
{
int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
- if ( pld!=ploidy->dflt )
+ if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
@@ -268,3 +268,23 @@ int ploidy_min(ploidy_t *ploidy)
return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
}
+char *ploidy_format(ploidy_t *ploidy)
+{
+ kstring_t str = {0,0,0};
+
+ regitr_t *itr = regitr_init(ploidy->idx);
+ while ( regitr_loop(itr) )
+ {
+ int id = regitr_payload(itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(itr,sex_ploidy_t).ploidy;
+ ksprintf(&str,"%s\t%d\t%d\t%s\t%d\n", itr->seq, itr->beg+1, itr->end+1, ploidy->id2sex[id],pld);
+ }
+ regitr_destroy(itr);
+
+ int i;
+ for (i=0; i < ploidy->nsex; i++)
+ ksprintf(&str,"*\t*\t*\t%s\t%d\n", ploidy->id2sex[i],ploidy->sex2dflt[i]);
+
+ return str.s;
+}
+
diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h
index 7697c65f..7625bd06 100644
--- a/bcftools/ploidy.h
+++ b/bcftools/ploidy.h
@@ -1,5 +1,5 @@
-/*
- Copyright (C) 2014-2015 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -9,10 +9,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -81,8 +81,8 @@ void ploidy_destroy(ploidy_t *ploidy);
* @param seq: chromosome name
* @param pos: 0-based position
* @param sex2ploidy: if not NULL, array will be filled with mapping from sex id to ploidy
- * @param min: if not NULL, minimum encountered encountered will be set
- * @param max: if not NULL, maximum encountered encountered will be set
+ * @param min: if not NULL, minimum encountered ploidy will be set
+ * @param max: if not NULL, maximum encountered ploidy will be set
*
* Returns 1 if the position is listed in the regions or 0 otherwise.
*/
@@ -125,5 +125,8 @@ regidx_t *ploidy_regions(ploidy_t *ploidy);
int ploidy_max(ploidy_t *ploidy);
int ploidy_min(ploidy_t *ploidy);
+/** Create a parseable ploidy file for debugging. The string must be free()-ed by the caller */
+char *ploidy_format(ploidy_t *ploidy);
+
#endif
diff --git a/bcftools/read_consensus.c b/bcftools/read_consensus.c
index 593b19b5..f66cc7dc 100644
--- a/bcftools/read_consensus.c
+++ b/bcftools/read_consensus.c
@@ -521,7 +521,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
}
else if ( cvar->vtype==ins )
{
- int len;
+ int len = 0;
ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg];
int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len);
if ( iseq==-2 ) break;
@@ -533,7 +533,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
}
else if ( cvar->vtype==del )
{
- int len;
+ int len = 0;
del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg];
int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len);
if ( ret==-2 ) break;
diff --git a/bcftools/read_consensus.c.pysam.c b/bcftools/read_consensus.c.pysam.c
index ef2ff089..802ce8ed 100644
--- a/bcftools/read_consensus.c.pysam.c
+++ b/bcftools/read_consensus.c.pysam.c
@@ -523,7 +523,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
}
else if ( cvar->vtype==ins )
{
- int len;
+ int len = 0;
ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg];
int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len);
if ( iseq==-2 ) break;
@@ -535,7 +535,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
}
else if ( cvar->vtype==del )
{
- int len;
+ int len = 0;
del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg];
int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len);
if ( ret==-2 ) break;
diff --git a/bcftools/regidx.c b/bcftools/regidx.c
index cdaf7eaf..445d7d58 100644
--- a/bcftools/regidx.c
+++ b/bcftools/regidx.c
@@ -1,5 +1,5 @@
-/*
- Copyright (C) 2014-2018 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -9,10 +9,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -65,12 +65,13 @@ struct _reglist_t
void *dat; // payload data
char *seq; // sequence name
int unsorted;
-
+ int merged;
};
// Container of all sequences
struct _regidx_t
{
+ int merge_overlaps;
int nseq, mseq; // n:used, m:alloced
reglist_t *seq; // regions for each sequence
void *seq2regs; // hash for fast lookup from chr name to regions
@@ -147,6 +148,11 @@ inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg
if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+ if ( beg > end )
+ {
+ uint32_t tmp = beg; beg = end; end = tmp;
+ }
+
int rid;
idx->str.l = 0;
kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
@@ -218,6 +224,24 @@ regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free
return idx;
}
+int regidx_set(regidx_t *idx, regidx_opt_t key, ...)
+{
+ va_list args;
+ switch (key)
+ {
+ case merge_overlaps:
+ va_start(args, key);
+ idx->merge_overlaps = va_arg(args,int);
+ va_end(args);
+ return 0;
+ default:
+ hts_log_error("Todo: regidx_set key=%d",(int)key);
+ return -1;
+ break;
+ }
+ return 0;
+}
+
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
@@ -250,7 +274,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
if ( payload_size ) idx->payload = malloc(payload_size);
if ( !fname ) return idx;
-
+
kstring_t str = {0,0,0};
htsFile *fp = hts_open(fname,"r");
@@ -299,53 +323,72 @@ void regidx_destroy(regidx_t *idx)
free(idx);
}
-int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+static void reglist_sort_(regidx_t *regidx, reglist_t *list)
{
- int i;
- if ( list->unsorted )
+ if ( !list->unsorted ) return;
+
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
{
- if ( !regidx->payload_size )
- qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
- else
+ int i;
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i < list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i < list->nreg; i++)
{
- reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
- for (i=0; i < list->nreg; i++) ptr[i] = list->reg + i;
- qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
-
- void *tmp_dat = malloc(regidx->payload_size*list->nreg);
- for (i=0; i < list->nreg; i++)
- {
- size_t iori = ptr[i] - list->reg;
- memcpy((char *)tmp_dat+i*regidx->payload_size,
- (char *)list->dat+iori*regidx->payload_size,
- regidx->payload_size);
- }
- free(list->dat);
- list->dat = tmp_dat;
-
- reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
- for (i=0; i < list->nreg; i++)
- {
- size_t iori = ptr[i] - list->reg;
- tmp_reg[i] = list->reg[iori];
- }
- free(ptr);
- free(list->reg);
- list->reg = tmp_reg;
- list->mreg = list->nreg;
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
}
- list->unsorted = 0;
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i < list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
}
+ list->unsorted = 0;
+}
+static void reglist_merge_(regidx_t *regidx, reglist_t *list)
+{
+ if ( list->merged ) return;
+ int j;
+ for (j=1; j < list->nreg; j++)
+ {
+ if ( list->reg[j-1].end < list->reg[j].beg ) continue;
+ if ( list->reg[j-1].end < list->reg[j].end ) list->reg[j-1].end = list->reg[j].end;
+ if ( j+1 < list->nreg ) memmove(&list->reg[j],&list->reg[j+1],(list->nreg-j-1)*sizeof(*list->reg));
+ j--;
+ list->nreg--;
+ }
+ list->merged = 1;
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ reglist_sort_(regidx,list);
+ if ( regidx->merge_overlaps ) reglist_merge_(regidx,list);
list->nidx = 0;
- int j,k, midx = 0;
+ int j, k, midx = 0;
for (j=0; j < list->nreg; j++)
{
int ibeg = iBIN(list->reg[j].beg);
int iend = iBIN(list->reg[j].end);
if ( midx <= iend )
{
- int old_midx = midx;
+ int old_midx = midx;
midx = iend + 1;
kroundup32(midx);
list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
@@ -436,7 +479,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && !isspace(*se) ) se++;
@@ -458,7 +501,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t
ss = se+1;
*end = strtod(ss, &se) - 1;
if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
-
+
return 0;
}
@@ -468,7 +511,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && !isspace(*se) ) se++;
@@ -515,7 +558,7 @@ int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && *se!=':' ) se++;
@@ -627,6 +670,12 @@ int regitr_loop(regitr_t *regitr)
itr->list = &regidx->seq[iseq];
}
+ if ( regidx->merge_overlaps )
+ {
+ reglist_sort_(regidx,itr->list);
+ reglist_merge_(regidx,itr->list);
+ }
+
regitr->seq = itr->list->seq;
regitr->beg = itr->list->reg[itr->ireg].beg;
regitr->end = itr->list->reg[itr->ireg].end;
diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c
index 4eb96e87..23df04c0 100644
--- a/bcftools/regidx.c.pysam.c
+++ b/bcftools/regidx.c.pysam.c
@@ -1,7 +1,7 @@
#include "bcftools.pysam.h"
-/*
- Copyright (C) 2014-2018 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -11,10 +11,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -67,12 +67,13 @@ struct _reglist_t
void *dat; // payload data
char *seq; // sequence name
int unsorted;
-
+ int merged;
};
// Container of all sequences
struct _regidx_t
{
+ int merge_overlaps;
int nseq, mseq; // n:used, m:alloced
reglist_t *seq; // regions for each sequence
void *seq2regs; // hash for fast lookup from chr name to regions
@@ -149,6 +150,11 @@ inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg
if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+ if ( beg > end )
+ {
+ uint32_t tmp = beg; beg = end; end = tmp;
+ }
+
int rid;
idx->str.l = 0;
kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
@@ -220,6 +226,24 @@ regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free
return idx;
}
+int regidx_set(regidx_t *idx, regidx_opt_t key, ...)
+{
+ va_list args;
+ switch (key)
+ {
+ case merge_overlaps:
+ va_start(args, key);
+ idx->merge_overlaps = va_arg(args,int);
+ va_end(args);
+ return 0;
+ default:
+ hts_log_error("Todo: regidx_set key=%d",(int)key);
+ return -1;
+ break;
+ }
+ return 0;
+}
+
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
@@ -252,7 +276,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
if ( payload_size ) idx->payload = malloc(payload_size);
if ( !fname ) return idx;
-
+
kstring_t str = {0,0,0};
htsFile *fp = hts_open(fname,"r");
@@ -301,53 +325,72 @@ void regidx_destroy(regidx_t *idx)
free(idx);
}
-int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+static void reglist_sort_(regidx_t *regidx, reglist_t *list)
{
- int i;
- if ( list->unsorted )
+ if ( !list->unsorted ) return;
+
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
{
- if ( !regidx->payload_size )
- qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
- else
+ int i;
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i < list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i < list->nreg; i++)
{
- reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
- for (i=0; i < list->nreg; i++) ptr[i] = list->reg + i;
- qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
-
- void *tmp_dat = malloc(regidx->payload_size*list->nreg);
- for (i=0; i < list->nreg; i++)
- {
- size_t iori = ptr[i] - list->reg;
- memcpy((char *)tmp_dat+i*regidx->payload_size,
- (char *)list->dat+iori*regidx->payload_size,
- regidx->payload_size);
- }
- free(list->dat);
- list->dat = tmp_dat;
-
- reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
- for (i=0; i < list->nreg; i++)
- {
- size_t iori = ptr[i] - list->reg;
- tmp_reg[i] = list->reg[iori];
- }
- free(ptr);
- free(list->reg);
- list->reg = tmp_reg;
- list->mreg = list->nreg;
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
}
- list->unsorted = 0;
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i < list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
}
+ list->unsorted = 0;
+}
+static void reglist_merge_(regidx_t *regidx, reglist_t *list)
+{
+ if ( list->merged ) return;
+ int j;
+ for (j=1; j < list->nreg; j++)
+ {
+ if ( list->reg[j-1].end < list->reg[j].beg ) continue;
+ if ( list->reg[j-1].end < list->reg[j].end ) list->reg[j-1].end = list->reg[j].end;
+ if ( j+1 < list->nreg ) memmove(&list->reg[j],&list->reg[j+1],(list->nreg-j-1)*sizeof(*list->reg));
+ j--;
+ list->nreg--;
+ }
+ list->merged = 1;
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ reglist_sort_(regidx,list);
+ if ( regidx->merge_overlaps ) reglist_merge_(regidx,list);
list->nidx = 0;
- int j,k, midx = 0;
+ int j, k, midx = 0;
for (j=0; j < list->nreg; j++)
{
int ibeg = iBIN(list->reg[j].beg);
int iend = iBIN(list->reg[j].end);
if ( midx <= iend )
{
- int old_midx = midx;
+ int old_midx = midx;
midx = iend + 1;
kroundup32(midx);
list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
@@ -438,7 +481,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && !isspace(*se) ) se++;
@@ -460,7 +503,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t
ss = se+1;
*end = strtod(ss, &se) - 1;
if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse bed line: %s\n", line); return -2; }
-
+
return 0;
}
@@ -470,7 +513,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && !isspace(*se) ) se++;
@@ -517,7 +560,7 @@ int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments
-
+
char *se = ss;
while ( *se && *se!=':' ) se++;
@@ -629,6 +672,12 @@ int regitr_loop(regitr_t *regitr)
itr->list = &regidx->seq[iseq];
}
+ if ( regidx->merge_overlaps )
+ {
+ reglist_sort_(regidx,itr->list);
+ reglist_merge_(regidx,itr->list);
+ }
+
regitr->seq = itr->list->seq;
regitr->beg = itr->list->reg[itr->ireg].beg;
regitr->end = itr->list->reg[itr->ireg].end;
diff --git a/bcftools/regidx.h b/bcftools/regidx.h
index 09c43f89..05167bbf 100644
--- a/bcftools/regidx.h
+++ b/bcftools/regidx.h
@@ -1,5 +1,5 @@
-/*
- Copyright (C) 2014-2016, 2018 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -9,10 +9,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -39,7 +39,7 @@
while ( regitr_overlap(itr) )
{
- printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end,
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end,
itr->beg+1, itr->end+1, regitr_payload(itr,char*));
}
@@ -48,7 +48,7 @@
Another example, loop over all regions:
-
+
regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
regitr_t *itr = regitr_init(idx);
@@ -105,6 +105,15 @@ regitr_t;
#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload))
+typedef enum
+{
+ merge_overlaps, // merge overlapping regions
+}
+regidx_opt_t;
+
+int regidx_set(regidx_t *idx, regidx_opt_t key, ...); // returns 0 on success
+
+
/*
* regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
* or regidx_parse_tab below. The function is expected to set `chr_from` and
@@ -121,7 +130,7 @@ typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end,
typedef void (*regidx_free_f)(void *payload);
/*
- * A note about the parsers:
+ * A note about the parsers:
* - leading spaces are ignored
* - lines starting with "#" are ignored
*/
@@ -164,7 +173,7 @@ void regidx_destroy(regidx_t *idx);
int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr);
/*
- * regidx_insert() - add a new region.
+ * regidx_insert() - add a new region.
* regidx_insert_list() - add new regions from a list
* regidx_push() - low level insertion of a new region
*
diff --git a/bcftools/reheader.c b/bcftools/reheader.c
index 37e5d965..1d4e85a2 100644
--- a/bcftools/reheader.c
+++ b/bcftools/reheader.c
@@ -1,6 +1,6 @@
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014-2022,2024 Genome Research Ltd.
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -418,7 +418,7 @@ static void reheader_vcf_gz(args_t *args)
// Output all remaining data read with the header block
if ( fp->block_length - skip_until > 0 )
{
- if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
@@ -434,8 +434,8 @@ static void reheader_vcf_gz(args_t *args)
int count = bgzf_raw_write(bgzf_out, buf, nread);
if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
}
- if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? args->output_fname : "-",bgzf_out->errcode);
- if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode);
+ if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %s\n",args->output_fname ? args->output_fname : "-",strerror(errno));
+ if (hts_close(args->fp)) error("Error closing %s: %s\n",args->fname,strerror(errno));
free(buf);
}
static void reheader_vcf(args_t *args)
@@ -661,12 +661,13 @@ static void usage(args_t *args)
fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n");
- fprintf(stderr, " -h, --header FILE new header\n");
- fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(stderr, " -s, --samples FILE new sample names\n");
- fprintf(stderr, " -T, --temp-prefix PATH ignored; was template for temporary file name\n");
- fprintf(stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n");
+ fprintf(stderr, " -f, --fai FILE Update sequences and their lengths from the .fai file\n");
+ fprintf(stderr, " -h, --header FILE New header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -s, --samples FILE New sample names\n");
+ fprintf(stderr, " -T, --temp-prefix PATH Ignored; was template for temporary file name\n");
+ fprintf(stderr, " --threads INT Use multithreading with worker threads (BCF only) [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, "\n");
fprintf(stderr, "Example:\n");
fprintf(stderr, " # Write out the header to be modified\n");
@@ -695,12 +696,16 @@ int main_reheader(int argc, char *argv[])
{"header",1,0,'h'},
{"samples",1,0,'s'},
{"threads",1,NULL,1},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
- while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "s:h:o:f:T:v:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 1 : args->n_threads = strtol(optarg, 0, 0); break;
case 'T': break; // unused - was temp file prefix
case 'f': args->fai_fname = optarg; break;
diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c
index 87d460a8..8e81a688 100644
--- a/bcftools/reheader.c.pysam.c
+++ b/bcftools/reheader.c.pysam.c
@@ -2,7 +2,7 @@
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014-2022,2024 Genome Research Ltd.
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -420,7 +420,7 @@ static void reheader_vcf_gz(args_t *args)
// Output all remaining data read with the header block
if ( fp->block_length - skip_until > 0 )
{
- if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
@@ -436,8 +436,8 @@ static void reheader_vcf_gz(args_t *args)
int count = bgzf_raw_write(bgzf_out, buf, nread);
if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
}
- if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? args->output_fname : "-",bgzf_out->errcode);
- if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode);
+ if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %s\n",args->output_fname ? args->output_fname : "-",strerror(errno));
+ if (hts_close(args->fp)) error("Error closing %s: %s\n",args->fname,strerror(errno));
free(buf);
}
static void reheader_vcf(args_t *args)
@@ -663,12 +663,13 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n");
- fprintf(bcftools_stderr, " -h, --header FILE new header\n");
- fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -s, --samples FILE new sample names\n");
- fprintf(bcftools_stderr, " -T, --temp-prefix PATH ignored; was template for temporary file name\n");
- fprintf(bcftools_stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n");
+ fprintf(bcftools_stderr, " -f, --fai FILE Update sequences and their lengths from the .fai file\n");
+ fprintf(bcftools_stderr, " -h, --header FILE New header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -s, --samples FILE New sample names\n");
+ fprintf(bcftools_stderr, " -T, --temp-prefix PATH Ignored; was template for temporary file name\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads (BCF only) [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Example:\n");
fprintf(bcftools_stderr, " # Write out the header to be modified\n");
@@ -697,12 +698,16 @@ int main_reheader(int argc, char *argv[])
{"header",1,0,'h'},
{"samples",1,0,'s'},
{"threads",1,NULL,1},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
- while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "s:h:o:f:T:v:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 1 : args->n_threads = strtol(optarg, 0, 0); break;
case 'T': break; // unused - was temp file prefix
case 'f': args->fai_fname = optarg; break;
diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c
index e3fbaccf..4bc4cec2 100644
--- a/bcftools/smpl_ilist.c
+++ b/bcftools/smpl_ilist.c
@@ -1,4 +1,4 @@
-/*
+/*
Copyright (C) 2016-2021 Genome Research Ltd.
Author: Petr Danecek
@@ -9,10 +9,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -162,7 +162,7 @@ smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
{
const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
- if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
error("The sample %s is not present in the second file\n", name);
}
return smpl;
diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c
index 68ed5279..e251b5ea 100644
--- a/bcftools/smpl_ilist.c.pysam.c
+++ b/bcftools/smpl_ilist.c.pysam.c
@@ -1,6 +1,6 @@
#include "bcftools.pysam.h"
-/*
+/*
Copyright (C) 2016-2021 Genome Research Ltd.
Author: Petr Danecek
@@ -11,10 +11,10 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -164,7 +164,7 @@ smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
{
const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
- if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
error("The sample %s is not present in the second file\n", name);
}
return smpl;
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c
index b66c8cf5..238c2e98 100644
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -1,6 +1,6 @@
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -774,6 +774,7 @@ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, vo
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
@@ -855,8 +856,8 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
else
{
args->tmpi[ntmpi-1] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ if ( end==str || (*end && *end!=',') )
+ error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
str = end+1;
}
}
@@ -938,6 +939,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
@@ -1124,6 +1126,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst
static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
{
assert( col->merge_method==MM_FIRST );
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
int nsrc = 1, lsrc = 0;
while ( args->tmps[lsrc] )
@@ -1668,8 +1671,8 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void
char *end = str;
ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ if ( end==str || (*end && *end!=',') )
+ error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ival++;
str = *end ? end+1 : end;
@@ -2313,7 +2316,7 @@ static void init_columns(args_t *args)
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
col->replace = replace;
- if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF);
+ if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY;
}
else args->alt_idx = icol;
}
@@ -2321,7 +2324,6 @@ static void init_columns(args_t *args)
{
if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
if ( str.s[0]=='~' ) replace = MATCH_VALUE;
- if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
@@ -2330,7 +2332,11 @@ static void init_columns(args_t *args)
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
- if ( replace & MATCH_VALUE ) args->match_id = icol;
+ if ( replace & MATCH_VALUE )
+ {
+ args->match_id = icol;
+ if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID;
+ }
}
else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
{
@@ -2408,8 +2414,8 @@ static void init_columns(args_t *args)
col->hdr_key_src = strdup(ptr+2);
col->hdr_key_dst = strdup(str.s+5);
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",col->hdr_key_dst);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ ksprintf(&tmp,"##INFO=",col->hdr_key_dst);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
@@ -2441,7 +2447,7 @@ static void init_columns(args_t *args)
if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
@@ -2475,7 +2481,7 @@ static void init_columns(args_t *args)
if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
@@ -2511,7 +2517,7 @@ static void init_columns(args_t *args)
if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
@@ -2559,7 +2565,7 @@ static void init_columns(args_t *args)
if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
}
@@ -2666,13 +2672,13 @@ static void init_columns(args_t *args)
{
// transferring ID column into a new INFO tag
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",key_dst);
+ ksprintf(&tmp,"##INFO=",key_dst);
}
else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
{
// transferring FILTER column into a new INFO tag
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",key_dst);
+ ksprintf(&tmp,"##INFO=",key_dst);
}
else
{
@@ -2692,7 +2698,7 @@ static void init_columns(args_t *args)
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
}
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
@@ -3122,6 +3128,11 @@ static void init_data(args_t *args)
&args->index_fn, args->write_index) < 0 )
error("Error: failed to initialise index for %s\n",args->output_fname);
}
+ if ( args->tgts_is_vcf )
+ {
+ if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME;
+ bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic);
+ }
}
static void destroy_data(args_t *args)
@@ -3650,7 +3661,7 @@ static void usage(args_t *args)
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
- fprintf(stderr, " --pair-logic STR Matching records by <snps|indels|both|all|some|exact>, see man page for details [some]\n");
+ fprintf(stderr, " --pair-logic STR Matching records by <snps|indels|both|all|some|exact|id>, see man page for details [some]\n");
fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
@@ -3661,6 +3672,7 @@ static void usage(args_t *args)
fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
@@ -3718,13 +3730,17 @@ int main_vcfannotate(int argc, char *argv[])
{"min-overlap",required_argument,NULL,12},
{"no-version",no_argument,NULL,8},
{"force",no_argument,NULL,'f'},
+ {"verbosity",required_argument,NULL,'v'},
{"write-index",optional_argument,NULL,'W'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::v:",loptions,NULL)) >= 0)
{
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'f': args->force = 1; break;
case 'k': args->keep_sites = 1; break;
case 'm':
@@ -3784,6 +3800,7 @@ int main_vcfannotate(int argc, char *argv[])
else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME;
else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT;
else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT;
+ else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID;
else error("The --pair-logic string \"%s\" not recognised.\n", optarg);
break;
case 3 :
@@ -3829,7 +3846,6 @@ int main_vcfannotate(int argc, char *argv[])
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME);
if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n");
}
}
@@ -3837,10 +3853,19 @@ int main_vcfannotate(int argc, char *argv[])
if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
- static int line_errcode_warned = 0;
+ static int line_errcode_warned = 0, vcf_parse_error_warned = 0;
init_data(args);
while ( bcf_sr_next_line(args->files) )
{
+ if ( args->files->errnum )
+ {
+ if ( !args->force ) error("Error: %s\n", bcf_sr_strerror(args->files->errnum));
+ else if ( !vcf_parse_error_warned )
+ {
+ fprintf(stderr,"Warning: Encountered an error, proceeding only because --force was given.\n");
+ vcf_parse_error_warned = 1;
+ }
+ }
if ( !bcf_sr_has_line(args->files,0) ) continue;
bcf1_t *line = bcf_sr_get_line(args->files,0);
if ( line->errcode )
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c
index 3d4d75ee..b4aeb56c 100644
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -776,6 +776,7 @@ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, vo
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
@@ -857,8 +858,8 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
else
{
args->tmpi[ntmpi-1] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ if ( end==str || (*end && *end!=',') )
+ error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
str = end+1;
}
}
@@ -940,6 +941,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
@@ -1126,6 +1128,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst
static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
{
assert( col->merge_method==MM_FIRST );
+ if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n");
int nsrc = 1, lsrc = 0;
while ( args->tmps[lsrc] )
@@ -1670,8 +1673,8 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void
char *end = str;
ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ if ( end==str || (*end && *end!=',') )
+ error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ival++;
str = *end ? end+1 : end;
@@ -2315,7 +2318,7 @@ static void init_columns(args_t *args)
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
col->replace = replace;
- if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF);
+ if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY;
}
else args->alt_idx = icol;
}
@@ -2323,7 +2326,6 @@ static void init_columns(args_t *args)
{
if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
if ( str.s[0]=='~' ) replace = MATCH_VALUE;
- if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
@@ -2332,7 +2334,11 @@ static void init_columns(args_t *args)
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
- if ( replace & MATCH_VALUE ) args->match_id = icol;
+ if ( replace & MATCH_VALUE )
+ {
+ args->match_id = icol;
+ if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID;
+ }
}
else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
{
@@ -2410,8 +2416,8 @@ static void init_columns(args_t *args)
col->hdr_key_src = strdup(ptr+2);
col->hdr_key_dst = strdup(str.s+5);
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",col->hdr_key_dst);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ ksprintf(&tmp,"##INFO=",col->hdr_key_dst);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
@@ -2443,7 +2449,7 @@ static void init_columns(args_t *args)
if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
}
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
@@ -2477,7 +2483,7 @@ static void init_columns(args_t *args)
if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue;
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
@@ -2513,7 +2519,7 @@ static void init_columns(args_t *args)
if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue;
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
@@ -2561,7 +2567,7 @@ static void init_columns(args_t *args)
if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
}
@@ -2668,13 +2674,13 @@ static void init_columns(args_t *args)
{
// transferring ID column into a new INFO tag
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",key_dst);
+ ksprintf(&tmp,"##INFO=",key_dst);
}
else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
{
// transferring FILTER column into a new INFO tag
tmp.l = 0;
- ksprintf(&tmp,"##INFO=",key_dst);
+ ksprintf(&tmp,"##INFO=",key_dst);
}
else
{
@@ -2694,7 +2700,7 @@ static void init_columns(args_t *args)
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
}
- bcf_hdr_append(args->hdr_out, tmp.s);
+ if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
@@ -3124,6 +3130,11 @@ static void init_data(args_t *args)
&args->index_fn, args->write_index) < 0 )
error("Error: failed to initialise index for %s\n",args->output_fname);
}
+ if ( args->tgts_is_vcf )
+ {
+ if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME;
+ bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic);
+ }
}
static void destroy_data(args_t *args)
@@ -3652,7 +3663,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
- fprintf(bcftools_stderr, " --pair-logic STR Matching records by <snps|indels|both|all|some|exact>, see man page for details [some]\n");
+ fprintf(bcftools_stderr, " --pair-logic STR Matching records by <snps|indels|both|all|some|exact|id>, see man page for details [some]\n");
fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
@@ -3663,6 +3674,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
@@ -3720,13 +3732,17 @@ int main_vcfannotate(int argc, char *argv[])
{"min-overlap",required_argument,NULL,12},
{"no-version",no_argument,NULL,8},
{"force",no_argument,NULL,'f'},
+ {"verbosity",required_argument,NULL,'v'},
{"write-index",optional_argument,NULL,'W'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::v:",loptions,NULL)) >= 0)
{
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'f': args->force = 1; break;
case 'k': args->keep_sites = 1; break;
case 'm':
@@ -3786,6 +3802,7 @@ int main_vcfannotate(int argc, char *argv[])
else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME;
else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT;
else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT;
+ else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID;
else error("The --pair-logic string \"%s\" not recognised.\n", optarg);
break;
case 3 :
@@ -3831,7 +3848,6 @@ int main_vcfannotate(int argc, char *argv[])
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME);
if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n");
}
}
@@ -3839,10 +3855,19 @@ int main_vcfannotate(int argc, char *argv[])
if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
- static int line_errcode_warned = 0;
+ static int line_errcode_warned = 0, vcf_parse_error_warned = 0;
init_data(args);
while ( bcf_sr_next_line(args->files) )
{
+ if ( args->files->errnum )
+ {
+ if ( !args->force ) error("Error: %s\n", bcf_sr_strerror(args->files->errnum));
+ else if ( !vcf_parse_error_warned )
+ {
+ fprintf(bcftools_stderr,"Warning: Encountered an error, proceeding only because --force was given.\n");
+ vcf_parse_error_warned = 1;
+ }
+ }
if ( !bcf_sr_has_line(args->files,0) ) continue;
bcf1_t *line = bcf_sr_get_line(args->files,0);
if ( line->errcode )
diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c
index 22390d0f..4dee727e 100644
--- a/bcftools/vcfbuf.c
+++ b/bcftools/vcfbuf.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2016-2024 Genome Research Ltd.
+ Copyright (c) 2016-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -49,9 +49,9 @@ typedef struct
}
vcfrec_t;
-#define PRUNE_MODE_MAX_AF 1
-#define PRUNE_MODE_1ST 2
-#define PRUNE_MODE_RAND 3
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST 2
+#define PRUNE_MODE_RAND 3
typedef struct
{
int max_sites, mvrec, mac, mfarr, mode;
@@ -62,6 +62,18 @@ typedef struct
}
prune_t;
+#define CLUSTER_MODE_PRUNE 1 // remove records whose cluster size exceeds cluster_t.max_sites
+#define CLUSTER_MODE_SIZE 2 // no pruning; make cluster size available via vcfbuf_get_val(buf,int,CLUSTER_SIZE);
+typedef struct // state of the site-clustering mode, used by cluster_can_flush_()
+{
+ int max_sites; // value given to vcfbuf_set() with CLUSTER_PRUNE or CLUSTER_SIZE; in prune mode a leading record is removed when its cluster size is above this
+ int mode; // one of CLUSTER_MODE_PRUNE or CLUSTER_MODE_SIZE; 0 leaves the cluster branch of vcfbuf_flush() disabled
+ int last; // cluster size of the record most recently released by cluster_can_flush_(), reset to 0 when that record's filter flag is set
+ int *size; // per-record cluster size: largest count of unfiltered sites found in any window that includes the record
+ rbuf_t rbuf; // index bookkeeping for size[]; asserted to hold as many elements as the main buffer's rbuf
+}
+cluster_t;
+
#define MARK_OVERLAP 1
#define MARK_DUP 2
@@ -108,7 +120,7 @@ mark_t;
struct _vcfbuf_t
{
- int win, // maximum number of sites in the buffer, either number of sites (<0) or bp (<0)
+ int win, // maximum number of sites in the buffer, either number of sites (>0) or bp (<0)
dummy; // the caller maintains the buffer via push/peek/flush
bcf_hdr_t *hdr;
vcfrec_t *vcf;
@@ -116,6 +128,7 @@ struct _vcfbuf_t
ld_t ld;
prune_t prune;
mark_t mark;
+ cluster_t cluster;
enum { clean, dirty } status;
};
@@ -129,6 +142,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
int i;
for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
rbuf_init(&buf->rbuf, 0);
+ rbuf_init(&buf->mark.rbuf, 0);
+ rbuf_init(&buf->cluster.rbuf, 0);
return buf;
}
@@ -149,6 +164,7 @@ void vcfbuf_destroy(vcfbuf_t *buf)
free(buf->mark.buf);
free(buf->mark.buf_ptr);
free(buf->mark.tmpi);
+ free(buf->cluster.size);
free(buf);
}
@@ -193,6 +209,20 @@ int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
va_end(args);
return 0;
+ case CLUSTER_PRUNE:
+ va_start(args, key);
+ buf->cluster.max_sites = va_arg(args,int);
+ buf->cluster.mode = CLUSTER_MODE_PRUNE;
+ va_end(args);
+ return 0;
+
+ case CLUSTER_SIZE:
+ va_start(args, key);
+ buf->cluster.max_sites = va_arg(args,int);
+ buf->cluster.mode = CLUSTER_MODE_SIZE;
+ va_end(args);
+ return 0;
+
case PRUNE_NSITES:
va_start(args, key);
buf->prune.max_sites = va_arg(args,int);
@@ -252,6 +282,8 @@ void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
va_start(args, key);
if ( key==MARK )
return &buf->mark.last;
+ if ( key==CLUSTER_SIZE )
+ return &buf->cluster.last;
va_end(args);
return NULL;
}
@@ -638,6 +670,106 @@ static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all)
return 1;
}
+int cluster_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+ cluster_t *cluster = &buf->cluster;
+
+//{ int i; i=-1; while ( rbuf_next(&buf->rbuf,&i) ) fprintf(stderr," %d",(int)buf->vcf[i].rec->pos+1); fprintf(stderr," .. dirty=%d flush_all=%d\n",buf->status,flush_all); }
+ if ( buf->status==dirty )
+ {
+ // a new site was just added by vcfbuf_push()
+ rbuf_expand0(&cluster->rbuf, int, buf->rbuf.n, cluster->size);
+ int i = rbuf_append(&cluster->rbuf);
+ cluster->size[i] = 0;
+ }
+ assert( cluster->rbuf.n==buf->rbuf.n );
+
+ // The following cases can occur:
+ // - if flush_all is set, then the entire buffer must be within the window
+ // - else if the last record can be on a different chr, then everything before can be flushed
+ // - else the last record can be either within or outside the window with respect to the first record
+
+
+ if ( buf->status==dirty )
+ {
+ int ib = 0;
+ while ( ib < buf->rbuf.n )
+ {
+ int b = rbuf_kth(&buf->rbuf, ib);
+ int ie = ib + 1;
+ while ( ie < buf->rbuf.n )
+ {
+ int e = rbuf_kth(&buf->rbuf, ie);
+ if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) break;
+ if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) break; // win is negative
+ ie++;
+ }
+ // now ie is just outside the window or beyond the last element of the window
+
+ // count the number of unfiltered sites that contribute to the cluster. Note this is inefficient,
+ // recalculating the same bits over and over, should be improved..
+ int ix, nbuf = 0;
+ for (ix=ib; ix<ie; ix++)
+ {
+ int x = rbuf_kth(&buf->rbuf, ix);
+ if ( buf->vcf[x].filter ) continue;
+ nbuf++;
+ }
+ for (ix=ib; ix<ie; ix++)
+ {
+ int x = rbuf_kth(&cluster->rbuf, ix);
+ if ( cluster->size[x] < nbuf ) cluster->size[x] = nbuf;
+ }
+ ib++;
+ }
+ buf->status = clean;
+ }
+
+ int b = rbuf_kth(&buf->rbuf, 0); // first
+ int e = rbuf_last(&buf->rbuf); // last
+ int can_flush = flush_all;
+ if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) can_flush = 1;
+ if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) can_flush = 1;
+ if ( !can_flush ) return 0;
+
+ if ( buf->cluster.mode==CLUSTER_MODE_PRUNE )
+ {
+ int flush = 0;
+ while ( buf->rbuf.n )
+ {
+ flush = 0;
+ int b = rbuf_kth(&buf->rbuf, 0);
+ int e = rbuf_kth(&buf->rbuf, -1);
+ if ( buf->vcf[b].filter )
+ {
+ // not to be pruned, not counted as part of the cluster
+ flush = 1;
+ break;
+ }
+
+ if ( flush_all ) flush = 1;
+ else if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) flush = 1;
+ else if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) flush = 1;
+ if ( !flush ) break;
+
+ b = rbuf_kth(&cluster->rbuf, 0);
+ if ( cluster->size[b] <= cluster->max_sites ) break; // not to be pruned
+
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, 0, buf->vcf);
+ rbuf_remove_kth(&cluster->rbuf, int, 0, cluster->size);
+ }
+ if ( !flush ) return 0;
+ }
+
+ if ( !cluster->rbuf.n ) return 0;
+
+ b = rbuf_shift(&cluster->rbuf);
+ cluster->last = cluster->size[b];
+ b = rbuf_kth(&buf->rbuf, 0);
+ if ( buf->vcf[b].filter ) cluster->last = 0;
+ return 1;
+}
+
bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
{
int i,j;
@@ -648,6 +780,13 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
// dummy mode, always flushing
if ( buf->dummy ) goto ret;
+ // either annotate or print clustered sites
+ if ( buf->cluster.mode )
+ {
+ if ( !cluster_can_flush_(buf,flush_all) ) return NULL;
+ goto ret;
+ }
+
// pruning mode
if ( buf->win )
{
diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c
index b74a5c49..ee7c9a70 100644
--- a/bcftools/vcfbuf.c.pysam.c
+++ b/bcftools/vcfbuf.c.pysam.c
@@ -2,7 +2,7 @@
/* The MIT License
- Copyright (c) 2016-2024 Genome Research Ltd.
+ Copyright (c) 2016-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -51,9 +51,9 @@ typedef struct
}
vcfrec_t;
-#define PRUNE_MODE_MAX_AF 1
-#define PRUNE_MODE_1ST 2
-#define PRUNE_MODE_RAND 3
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST 2
+#define PRUNE_MODE_RAND 3
typedef struct
{
int max_sites, mvrec, mac, mfarr, mode;
@@ -64,6 +64,18 @@ typedef struct
}
prune_t;
+#define CLUSTER_MODE_PRUNE 1 // remove records whose cluster size exceeds cluster_t.max_sites
+#define CLUSTER_MODE_SIZE 2 // no pruning; make cluster size available via vcfbuf_get_val(buf,int,CLUSTER_SIZE);
+typedef struct // state of the site-clustering mode, used by cluster_can_flush_()
+{
+ int max_sites; // value given to vcfbuf_set() with CLUSTER_PRUNE or CLUSTER_SIZE; in prune mode a leading record is removed when its cluster size is above this
+ int mode; // one of CLUSTER_MODE_PRUNE or CLUSTER_MODE_SIZE; 0 leaves the cluster branch of vcfbuf_flush() disabled
+ int last; // cluster size of the record most recently released by cluster_can_flush_(), reset to 0 when that record's filter flag is set
+ int *size; // per-record cluster size: largest count of unfiltered sites found in any window that includes the record
+ rbuf_t rbuf; // index bookkeeping for size[]; asserted to hold as many elements as the main buffer's rbuf
+}
+cluster_t;
+
#define MARK_OVERLAP 1
#define MARK_DUP 2
@@ -110,7 +122,7 @@ mark_t;
struct _vcfbuf_t
{
- int win, // maximum number of sites in the buffer, either number of sites (<0) or bp (<0)
+ int win, // maximum number of sites in the buffer, either number of sites (>0) or bp (<0)
dummy; // the caller maintains the buffer via push/peek/flush
bcf_hdr_t *hdr;
vcfrec_t *vcf;
@@ -118,6 +130,7 @@ struct _vcfbuf_t
ld_t ld;
prune_t prune;
mark_t mark;
+ cluster_t cluster;
enum { clean, dirty } status;
};
@@ -131,6 +144,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
int i;
for (i=0; ild.max[i] = HUGE_VAL;
rbuf_init(&buf->rbuf, 0);
+ rbuf_init(&buf->mark.rbuf, 0);
+ rbuf_init(&buf->cluster.rbuf, 0);
return buf;
}
@@ -151,6 +166,7 @@ void vcfbuf_destroy(vcfbuf_t *buf)
free(buf->mark.buf);
free(buf->mark.buf_ptr);
free(buf->mark.tmpi);
+ free(buf->cluster.size);
free(buf);
}
@@ -195,6 +211,20 @@ int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
va_end(args);
return 0;
+ case CLUSTER_PRUNE:
+ va_start(args, key);
+ buf->cluster.max_sites = va_arg(args,int);
+ buf->cluster.mode = CLUSTER_MODE_PRUNE;
+ va_end(args);
+ return 0;
+
+ case CLUSTER_SIZE:
+ va_start(args, key);
+ buf->cluster.max_sites = va_arg(args,int);
+ buf->cluster.mode = CLUSTER_MODE_SIZE;
+ va_end(args);
+ return 0;
+
case PRUNE_NSITES:
va_start(args, key);
buf->prune.max_sites = va_arg(args,int);
@@ -254,6 +284,8 @@ void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
va_start(args, key);
if ( key==MARK )
return &buf->mark.last;
+ if ( key==CLUSTER_SIZE )
+ return &buf->cluster.last;
va_end(args);
return NULL;
}
@@ -640,6 +672,106 @@ static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all)
return 1;
}
+int cluster_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+ cluster_t *cluster = &buf->cluster;
+
+//{ int i; i=-1; while ( rbuf_next(&buf->rbuf,&i) ) fprintf(bcftools_stderr," %d",(int)buf->vcf[i].rec->pos+1); fprintf(bcftools_stderr," .. dirty=%d flush_all=%d\n",buf->status,flush_all); }
+ if ( buf->status==dirty )
+ {
+ // a new site was just added by vcfbuf_push()
+ rbuf_expand0(&cluster->rbuf, int, buf->rbuf.n, cluster->size);
+ int i = rbuf_append(&cluster->rbuf);
+ cluster->size[i] = 0;
+ }
+ assert( cluster->rbuf.n==buf->rbuf.n );
+
+ // The following cases can occur:
+ // - if flush_all is set, then the entire buffer must be within the window
+ // - else if the last record can be on a different chr, then everything before can be flushed
+ // - else the last record can be either within or outside the window with respect to the first record
+
+
+ if ( buf->status==dirty )
+ {
+ int ib = 0;
+ while ( ib < buf->rbuf.n )
+ {
+ int b = rbuf_kth(&buf->rbuf, ib);
+ int ie = ib + 1;
+ while ( ie < buf->rbuf.n )
+ {
+ int e = rbuf_kth(&buf->rbuf, ie);
+ if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) break;
+ if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) break; // win is negative
+ ie++;
+ }
+ // now ie is just outside the window or beyond the last element of the window
+
+ // count the number of unfiltered sites that contribute to the cluster. Note this is inefficient,
+ // recalculating the same bits over and over, should be improved..
+ int ix, nbuf = 0;
+ for (ix=ib; ixrbuf, ix);
+ if ( buf->vcf[x].filter ) continue;
+ nbuf++;
+ }
+ for (ix=ib; ixrbuf, ix);
+ if ( cluster->size[x] < nbuf ) cluster->size[x] = nbuf;
+ }
+ ib++;
+ }
+ buf->status = clean;
+ }
+
+ int b = rbuf_kth(&buf->rbuf, 0); // first
+ int e = rbuf_last(&buf->rbuf); // last
+ int can_flush = flush_all;
+ if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) can_flush = 1;
+ if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) can_flush = 1;
+ if ( !can_flush ) return 0;
+
+ if ( buf->cluster.mode==CLUSTER_MODE_PRUNE )
+ {
+ int flush = 0;
+ while ( buf->rbuf.n )
+ {
+ flush = 0;
+ int b = rbuf_kth(&buf->rbuf, 0);
+ int e = rbuf_kth(&buf->rbuf, -1);
+ if ( buf->vcf[b].filter )
+ {
+ // not to be pruned, not counted as part of the cluster
+ flush = 1;
+ break;
+ }
+
+ if ( flush_all ) flush = 1;
+ else if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) flush = 1;
+ else if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) flush = 1;
+ if ( !flush ) break;
+
+ b = rbuf_kth(&cluster->rbuf, 0);
+ if ( cluster->size[b] <= cluster->max_sites ) break; // not to be pruned
+
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, 0, buf->vcf);
+ rbuf_remove_kth(&cluster->rbuf, int, 0, cluster->size);
+ }
+ if ( !flush ) return 0;
+ }
+
+ if ( !cluster->rbuf.n ) return 0;
+
+ b = rbuf_shift(&cluster->rbuf);
+ cluster->last = cluster->size[b];
+ b = rbuf_kth(&buf->rbuf, 0);
+ if ( buf->vcf[b].filter ) cluster->last = 0;
+ return 1;
+}
+
bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
{
int i,j;
@@ -650,6 +782,13 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
// dummy mode, always flushing
if ( buf->dummy ) goto ret;
+ // either annotate or print clustered sites
+ if ( buf->cluster.mode )
+ {
+ if ( !cluster_can_flush_(buf,flush_all) ) return NULL;
+ goto ret;
+ }
+
// pruning mode
if ( buf->win )
{
diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h
index 96d7115c..15054ff3 100644
--- a/bcftools/vcfbuf.h
+++ b/bcftools/vcfbuf.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2017-2024 Genome Research Ltd.
+ Copyright (c) 2017-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -45,6 +45,11 @@ typedef enum
PRUNE_NSITES_MODE, // char *, maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly)
PRUNE_AF_TAG, // char *, use this INFO/AF tag with VCFBUF_NSITES
+ CLUSTER_PRUNE, // int, remove clusters of more than this many sites within the window
+ CLUSTER_SIZE, // w: int, if set, vcfbuf_get_val(buf,int,CLUSTER_SIZE) will be returning the cluster size
+ // r: use as in the example for MARK below. Returns positive values for valid sites,
+ // 0 for filtered sites
+
// duplicates and overlaps
MARK, // w: char *, resolve overlaps by preferentially removing sites according to EXPR:
// min(QUAL) .. remove sites with lowest QUAL until overlaps are resolved
@@ -134,19 +139,20 @@ int vcfbuf_nsites(vcfbuf_t *buf);
* Returns 0 on success or -1 if no values were filled.
*
* @val: will be filled with the values
- * .. correlation coefficient r-squared
- * .. Lewontin's D' (PMID: 19433632)
- * .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265)
+ * r2 .. correlation coefficient r-squared
+ * LD .. Lewontin's D' (doi:10.1534/genetics.108.093153)
+ * RD,HD .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265)
* @rec: corresponding positions or NULL if the value(s) has not been set
*/
#define VCFBUF_LD_N 3
#define VCFBUF_LD_IDX_R2 0
#define VCFBUF_LD_IDX_LD 1
#define VCFBUF_LD_IDX_HD 2
+#define VCFBUF_LD_IDX_RD 2
typedef struct
{
- double val[VCFBUF_LD_N]; // r2, ld, hd
- bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, hd
+ double val[VCFBUF_LD_N]; // r2, ld, rd
+ bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, rd
}
vcfbuf_ld_t;
int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld);
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c
index 13e516f8..7ea14366 100644
--- a/bcftools/vcfcall.c
+++ b/bcftools/vcfcall.c
@@ -1,6 +1,6 @@
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -112,18 +112,15 @@ typedef struct
}
args_t;
-static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
+static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char *sex, int *ith)
{
int ret = khash_str2int_get(name2idx, name, ith);
if ( ret==0 ) return lines;
hts_expand(char*,(*nlines+1),*mlines,lines);
- int len = strlen(name);
- lines[*nlines] = (char*) malloc(len+3);
- memcpy(lines[*nlines],name,len);
- lines[*nlines][len] = ' ';
- lines[*nlines][len+1] = sex;
- lines[*nlines][len+2] = 0;
+ kstring_t str = {0,0,0};
+ ksprintf(&str,"%s %s",name,sex);
+ lines[*nlines] = str.s;
*ith = *nlines;
(*nlines)++;
khash_str2int_set(name2idx, strdup(name), *ith);
@@ -205,12 +202,14 @@ static ploidy_predef_t ploidy_predefs[] =
// only 5 columns are required and the first is ignored:
// ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
-static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
+static char **parse_ped_samples(args_t *args, call_t *call, char **vals, int nvals, int *nsmpl)
{
int i, j, mlines = 0, nlines = 0;
kstring_t str = {0,0,0}, fam_str = {0,0,0};
void *name2idx = khash_str2int_init();
char **lines = NULL;
+
+ char *msex = "M", *fsex = "F";
for (i=0; iploidy,sex)<0 )
+ {
+ // this gender is not defined, if 1/2, test if M/F is
+ if ( !strcmp(sex,"1") && ploidy_sex2id(args->ploidy,msex)>=0 ) sex = msex;
+ else if ( !strcmp(sex,"2") && ploidy_sex2id(args->ploidy,fsex)>=0 ) sex = fsex;
+ else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex);
+ }
lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
{
@@ -248,9 +250,9 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl
fam->name = strdup(fam_str.s);
if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
- lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, msex, &fam->sample[FATHER]);
if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
- lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, fsex, &fam->sample[MOTHER]);
khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
@@ -276,12 +278,17 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl
*/
static void set_samples(args_t *args, const char *fn, int is_file)
{
- int i, nlines;
+ int i, nlines, negate = 0;
+ if ( fn[0]=='^' )
+ {
+ negate = 1;
+ fn++;
+ }
char **lines = hts_readlist(fn, is_file, &nlines);
if ( !lines ) error("Could not read the file: %s\n", fn);
int nsmpls;
- char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
+ char **smpls = parse_ped_samples(args, &args->aux, lines, nlines, &nsmpls);
if ( smpls )
{
for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- for (i=0; iaux.hdr); i++) old2new[i] = -1;
-
int nsmpl = 0, map_needed = 0;
- for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
- if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
- if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
-
- ss = se+(x != '\0');
- while ( *ss && isspace(*ss) ) ss++;
- if ( !*ss ) ss = "2"; // default ploidy
- se = ss;
- while ( *se && !isspace(*se) ) se++;
- if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
-
- if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
- args->sample2sex[nsmpl] = -1*(ss[0]-'0');
- else
- args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
+ for (i=0; iaux.hdr); i++) old2new[i] = -1;
+ for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
+
+ ss = se+(x != '\0');
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) ss = "2"; // default ploidy
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
+
+ char *sex = ss;
+ if ( ploidy_sex2id(args->ploidy,sex)<0 )
+ {
+ if ( sex[1]==0 && (sex[0]=='0' || sex[0]=='1' || sex[0]=='2') ) args->sample2sex[nsmpl] = -1*(sex[0]-'0');
+ else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex);
+ }
+ else
+ args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy,sex);
- if ( ismpl!=nsmpl ) map_needed = 1;
- args->samples_map[nsmpl] = ismpl;
- old2new[ismpl] = nsmpl;
- nsmpl++;
+ if ( ismpl!=nsmpl ) map_needed = 1;
+ args->samples_map[nsmpl] = ismpl;
+ old2new[ismpl] = nsmpl;
+ nsmpl++;
+ }
+ if ( nsmpl!=bcf_hdr_nsamples(args->aux.hdr) ) map_needed = 1;
+ }
+ else
+ {
+ // negate: in this mode the default ploidy must be used for obvious reason - there is no way to
+ // specify ploidy if the sample name is not shown
+ for (i=0; iaux.hdr); i++) old2new[i] = 1; // by default keep the sample
+ for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+
+ old2new[ismpl] = 0; // do not keep this sample
+ free(lines[i]);
+ }
+ free(lines);
+ lines = malloc(sizeof(*lines)*bcf_hdr_nsamples(args->aux.hdr));
+ nsmpl = 0;
+ for (i=0; iaux.hdr); i++)
+ {
+ if ( !old2new[i] ) continue;
+ lines[nsmpl] = strdup(args->aux.hdr->samples[i]);
+ args->samples_map[nsmpl] = i;
+ old2new[i] = nsmpl;
+ nsmpl++;
+ }
+ map_needed = 1;
}
for (i=0; iaux.nfams; i++)
@@ -927,6 +975,7 @@ static void usage(args_t *args)
fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
fprintf(stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(stderr, " --verbosity INT Verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Consensus/variant calling options:\n");
@@ -1012,6 +1061,7 @@ int main_vcfcall(int argc, char *argv[])
{"chromosome-Y",no_argument,NULL,'Y'},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,10},
{NULL,0,NULL,0}
};
@@ -1103,6 +1153,9 @@ int main_vcfcall(int argc, char *argv[])
if (!(args.write_index = write_index_parse(optarg)))
error("Unsupported index format '%s'\n", optarg);
break;
+ case 10:
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
default: usage(&args);
}
}
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c
index 0da344ef..3dab20d7 100644
--- a/bcftools/vcfcall.c.pysam.c
+++ b/bcftools/vcfcall.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -114,18 +114,15 @@ typedef struct
}
args_t;
-static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
+static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char *sex, int *ith)
{
int ret = khash_str2int_get(name2idx, name, ith);
if ( ret==0 ) return lines;
hts_expand(char*,(*nlines+1),*mlines,lines);
- int len = strlen(name);
- lines[*nlines] = (char*) malloc(len+3);
- memcpy(lines[*nlines],name,len);
- lines[*nlines][len] = ' ';
- lines[*nlines][len+1] = sex;
- lines[*nlines][len+2] = 0;
+ kstring_t str = {0,0,0};
+ ksprintf(&str,"%s %s",name,sex);
+ lines[*nlines] = str.s;
*ith = *nlines;
(*nlines)++;
khash_str2int_set(name2idx, strdup(name), *ith);
@@ -207,12 +204,14 @@ static ploidy_predef_t ploidy_predefs[] =
// only 5 columns are required and the first is ignored:
// ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
-static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
+static char **parse_ped_samples(args_t *args, call_t *call, char **vals, int nvals, int *nsmpl)
{
int i, j, mlines = 0, nlines = 0;
kstring_t str = {0,0,0}, fam_str = {0,0,0};
void *name2idx = khash_str2int_init();
char **lines = NULL;
+
+ char *msex = "M", *fsex = "F";
for (i=0; iploidy,sex)<0 )
+ {
+ // this gender is not defined, if 1/2, test if M/F is
+ if ( !strcmp(sex,"1") && ploidy_sex2id(args->ploidy,msex)>=0 ) sex = msex;
+ else if ( !strcmp(sex,"2") && ploidy_sex2id(args->ploidy,fsex)>=0 ) sex = fsex;
+ else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex);
+ }
lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
{
@@ -250,9 +252,9 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl
fam->name = strdup(fam_str.s);
if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
- lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, msex, &fam->sample[FATHER]);
if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
- lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
+ lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, fsex, &fam->sample[MOTHER]);
khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
@@ -278,12 +280,17 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl
*/
static void set_samples(args_t *args, const char *fn, int is_file)
{
- int i, nlines;
+ int i, nlines, negate = 0;
+ if ( fn[0]=='^' )
+ {
+ negate = 1;
+ fn++;
+ }
char **lines = hts_readlist(fn, is_file, &nlines);
if ( !lines ) error("Could not read the file: %s\n", fn);
int nsmpls;
- char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
+ char **smpls = parse_ped_samples(args, &args->aux, lines, nlines, &nsmpls);
if ( smpls )
{
for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- for (i=0; iaux.hdr); i++) old2new[i] = -1;
-
int nsmpl = 0, map_needed = 0;
- for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
- if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
- if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
-
- ss = se+(x != '\0');
- while ( *ss && isspace(*ss) ) ss++;
- if ( !*ss ) ss = "2"; // default ploidy
- se = ss;
- while ( *se && !isspace(*se) ) se++;
- if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
-
- if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
- args->sample2sex[nsmpl] = -1*(ss[0]-'0');
- else
- args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
+ for (i=0; iaux.hdr); i++) old2new[i] = -1;
+ for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
+
+ ss = se+(x != '\0');
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) ss = "2"; // default ploidy
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
+
+ char *sex = ss;
+ if ( ploidy_sex2id(args->ploidy,sex)<0 )
+ {
+ if ( sex[1]==0 && (sex[0]=='0' || sex[0]=='1' || sex[0]=='2') ) args->sample2sex[nsmpl] = -1*(sex[0]-'0');
+ else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex);
+ }
+ else
+ args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy,sex);
- if ( ismpl!=nsmpl ) map_needed = 1;
- args->samples_map[nsmpl] = ismpl;
- old2new[ismpl] = nsmpl;
- nsmpl++;
+ if ( ismpl!=nsmpl ) map_needed = 1;
+ args->samples_map[nsmpl] = ismpl;
+ old2new[ismpl] = nsmpl;
+ nsmpl++;
+ }
+ if ( nsmpl!=bcf_hdr_nsamples(args->aux.hdr) ) map_needed = 1;
+ }
+ else
+ {
+ // negate: in this mode the default ploidy must be used for obvious reason - there is no way to
+ // specify ploidy if the sample name is not shown
+ for (i=0; iaux.hdr); i++) old2new[i] = 1; // by default keep the sample
+ for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss);
+ if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+
+ old2new[ismpl] = 0; // do not keep this sample
+ free(lines[i]);
+ }
+ free(lines);
+ lines = malloc(sizeof(*lines)*bcf_hdr_nsamples(args->aux.hdr));
+ nsmpl = 0;
+ for (i=0; iaux.hdr); i++)
+ {
+ if ( !old2new[i] ) continue;
+ lines[nsmpl] = strdup(args->aux.hdr->samples[i]);
+ args->samples_map[nsmpl] = i;
+ old2new[i] = nsmpl;
+ nsmpl++;
+ }
+ map_needed = 1;
}
for (i=0; iaux.nfams; i++)
@@ -929,6 +977,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(bcftools_stderr, " --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
@@ -1014,6 +1063,7 @@ int main_vcfcall(int argc, char *argv[])
{"chromosome-Y",no_argument,NULL,'Y'},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,10},
{NULL,0,NULL,0}
};
@@ -1105,6 +1155,9 @@ int main_vcfcall(int argc, char *argv[])
if (!(args.write_index = write_index_parse(optarg)))
error("Unsupported index format '%s'\n", optarg);
break;
+ case 10:
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
default: usage(&args);
}
}
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c
index e970b043..2e44a8e5 100644
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -1,19 +1,19 @@
/* The MIT License
- Copyright (c) 2014-2022 Genome Research Ltd.
+ Copyright (c) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -97,7 +97,7 @@ typedef struct _args_t
uint32_t *sites; // positions [nsites,msites]
int nsites, msites;
- double baum_welch_th, optimize_frac;
+ double baum_welch_th, optimize_frac;
float plot_th;
FILE *summary_fh;
char **argv, *regions_list, *summary_fname, *output_dir;
@@ -144,7 +144,7 @@ static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob)
{
// interpret ij_prob differently, as ii_prob in fact, so that for two
// samples the behaviour is somewhat closer to single sample calling
- // with s=0.
+ // with s=0.
double pii = 1 - ij_prob*(N_STATES-1);
ij_prob = (1 - pii) / (ndim - 1);
for (j=0; jhdr)>1 ) error("Multi-sample VCF, missing the -s option\n");
args->query_sample.name = strdup(args->hdr->samples[0]);
}
- else
+ else
if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name);
if ( !args->files->readers[0].file->is_bin )
{
@@ -283,7 +283,7 @@ static void init_data(args_t *args)
}
else
args->summary_fh = NULL; // one sample only, no two-file summary
-
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
@@ -391,7 +391,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
" plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n"
" plt.savefig('%s/plot.%s.chr'+chr+'.png')\n"
" plt.close()\n"
- "\n",
+ "\n",
smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name
);
fclose(fp);
@@ -557,7 +557,7 @@ static void create_plots(args_t *args)
" plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n"
" plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n"
" plt.close()\n"
- "\n",
+ "\n",
args->control_sample.name,args->query_sample.name,
args->output_dir,
args->control_sample.dat_fname,args->query_sample.dat_fname,
@@ -643,17 +643,17 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite)
return 0;
}
- double cn1_baf =
+ double cn1_baf =
norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) +
norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ;
- double cn2_baf =
- norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
- norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
+ double cn2_baf =
+ norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA;
- double cn3_baf =
- norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
- norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
- norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
+ double cn3_baf =
+ norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA;
double norm = cn1_baf + cn2_baf + cn3_baf;
@@ -1134,7 +1134,7 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt,
static void cnv_next_line(args_t *args, bcf1_t *line)
{
- if ( !line )
+ if ( !line )
{
// Done, flush viterbi
cnv_flush_viterbi(args);
@@ -1154,7 +1154,7 @@ static void cnv_next_line(args_t *args, bcf1_t *line)
args->ntot++;
bcf_fmt_t *baf_fmt, *lrr_fmt = NULL;
- if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
+ if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return;
float baf1,lrr1,baf2,lrr2;
@@ -1226,6 +1226,7 @@ static void usage(args_t *args)
fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, "HMM Options:\n");
fprintf(stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n");
fprintf(stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n");
@@ -1271,7 +1272,7 @@ int main_vcfcnv(int argc, char *argv[])
int regions_overlap = 1;
int targets_overlap = 0;
- static struct option loptions[] =
+ static struct option loptions[] =
{
{"BAF-dev",1,0,'d'},
{"LRR-dev",1,0,'k'},
@@ -1295,17 +1296,21 @@ int main_vcfcnv(int argc, char *argv[])
{"regions-overlap",required_argument,NULL,3},
{"plot-threshold",1,0,'p'},
{"output-dir",1,0,'o'},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:v:",loptions,NULL)) >= 0) {
switch (c) {
- case 'L':
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
+ case 'L':
args->lrr_smooth_win = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg);
break;
case 'f': args->af_fname = optarg; break;
- case 'O':
+ case 'O':
args->optimize_frac = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -O %s\n", optarg);
break;
@@ -1348,27 +1353,27 @@ int main_vcfcnv(int argc, char *argv[])
args->baum_welch_th = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -W %s\n", optarg);
break;
- case 'e':
+ case 'e':
args->err_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -e %s\n", optarg);
break;
- case 'b':
+ case 'b':
args->baf_bias = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -b %s\n", optarg);
break;
- case 'x':
+ case 'x':
args->ij_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -x %s\n", optarg);
break;
- case 'P':
+ case 'P':
args->same_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -P %s\n", optarg);
break;
- case 'l':
+ case 'l':
args->lrr_bias = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -l %s\n", optarg);
break;
- case 'p':
+ case 'p':
args->plot_th = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -p %s\n", optarg);
break;
@@ -1387,7 +1392,7 @@ int main_vcfcnv(int argc, char *argv[])
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
- case 'h':
+ case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
@@ -1421,7 +1426,7 @@ int main_vcfcnv(int argc, char *argv[])
}
if ( !bcf_sr_add_reader(args->files, fname) )
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
-
+
init_data(args);
while ( bcf_sr_next_line(args->files) )
{
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c
index d6ee4212..031ba53a 100644
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -2,20 +2,20 @@
/* The MIT License
- Copyright (c) 2014-2022 Genome Research Ltd.
+ Copyright (c) 2014-2025 Genome Research Ltd.
Author: Petr Danecek
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -99,7 +99,7 @@ typedef struct _args_t
uint32_t *sites; // positions [nsites,msites]
int nsites, msites;
- double baum_welch_th, optimize_frac;
+ double baum_welch_th, optimize_frac;
float plot_th;
FILE *summary_fh;
char **argv, *regions_list, *summary_fname, *output_dir;
@@ -146,7 +146,7 @@ static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob)
{
// interpret ij_prob differently, as ii_prob in fact, so that for two
// samples the behaviour is somewhat closer to single sample calling
- // with s=0.
+ // with s=0.
double pii = 1 - ij_prob*(N_STATES-1);
ij_prob = (1 - pii) / (ndim - 1);
for (j=0; jhdr)>1 ) error("Multi-sample VCF, missing the -s option\n");
args->query_sample.name = strdup(args->hdr->samples[0]);
}
- else
+ else
if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name);
if ( !args->files->readers[0].file->is_bin )
{
@@ -285,7 +285,7 @@ static void init_data(args_t *args)
}
else
args->summary_fh = NULL; // one sample only, no two-file summary
-
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
@@ -393,7 +393,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
" plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n"
" plt.savefig('%s/plot.%s.chr'+chr+'.png')\n"
" plt.close()\n"
- "\n",
+ "\n",
smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name
);
fclose(fp);
@@ -559,7 +559,7 @@ static void create_plots(args_t *args)
" plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n"
" plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n"
" plt.close()\n"
- "\n",
+ "\n",
args->control_sample.name,args->query_sample.name,
args->output_dir,
args->control_sample.dat_fname,args->query_sample.dat_fname,
@@ -645,17 +645,17 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite)
return 0;
}
- double cn1_baf =
+ double cn1_baf =
norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) +
norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ;
- double cn2_baf =
- norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
- norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
+ double cn2_baf =
+ norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA +
norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA;
- double cn3_baf =
- norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
- norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
- norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
+ double cn3_baf =
+ norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR +
+ norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 +
+ norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 +
norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA;
double norm = cn1_baf + cn2_baf + cn3_baf;
@@ -1136,7 +1136,7 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt,
static void cnv_next_line(args_t *args, bcf1_t *line)
{
- if ( !line )
+ if ( !line )
{
// Done, flush viterbi
cnv_flush_viterbi(args);
@@ -1156,7 +1156,7 @@ static void cnv_next_line(args_t *args, bcf1_t *line)
args->ntot++;
bcf_fmt_t *baf_fmt, *lrr_fmt = NULL;
- if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
+ if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return;
if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return;
float baf1,lrr1,baf2,lrr2;
@@ -1228,6 +1228,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, "HMM Options:\n");
fprintf(bcftools_stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n");
fprintf(bcftools_stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n");
@@ -1273,7 +1274,7 @@ int main_vcfcnv(int argc, char *argv[])
int regions_overlap = 1;
int targets_overlap = 0;
- static struct option loptions[] =
+ static struct option loptions[] =
{
{"BAF-dev",1,0,'d'},
{"LRR-dev",1,0,'k'},
@@ -1297,17 +1298,21 @@ int main_vcfcnv(int argc, char *argv[])
{"regions-overlap",required_argument,NULL,3},
{"plot-threshold",1,0,'p'},
{"output-dir",1,0,'o'},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:v:",loptions,NULL)) >= 0) {
switch (c) {
- case 'L':
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
+ case 'L':
args->lrr_smooth_win = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg);
break;
case 'f': args->af_fname = optarg; break;
- case 'O':
+ case 'O':
args->optimize_frac = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -O %s\n", optarg);
break;
@@ -1350,27 +1355,27 @@ int main_vcfcnv(int argc, char *argv[])
args->baum_welch_th = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -W %s\n", optarg);
break;
- case 'e':
+ case 'e':
args->err_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -e %s\n", optarg);
break;
- case 'b':
+ case 'b':
args->baf_bias = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -b %s\n", optarg);
break;
- case 'x':
+ case 'x':
args->ij_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -x %s\n", optarg);
break;
- case 'P':
+ case 'P':
args->same_prob = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -P %s\n", optarg);
break;
- case 'l':
+ case 'l':
args->lrr_bias = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -l %s\n", optarg);
break;
- case 'p':
+ case 'p':
args->plot_th = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -p %s\n", optarg);
break;
@@ -1389,7 +1394,7 @@ int main_vcfcnv(int argc, char *argv[])
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
- case 'h':
+ case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
@@ -1423,7 +1428,7 @@ int main_vcfcnv(int argc, char *argv[])
}
if ( !bcf_sr_add_reader(args->files, fname) )
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
-
+
init_data(args);
while ( bcf_sr_next_line(args->files) )
{
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c
index 232b3ae3..48afd20f 100644
--- a/bcftools/vcfconcat.c
+++ b/bcftools/vcfconcat.c
@@ -1,6 +1,6 @@
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -641,7 +641,7 @@ static void concat(args_t *args)
bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
bcf_hdr_destroy(hdr_ori);
}
- if ( !fp->is_bin && args->output_type&FT_VCF )
+ if ( !fp->is_bin && args->output_type&FT_VCF && !args->out_fh->idx)
{
line->max_unpack = BCF_UN_STR;
// if VCF is on both input and output, avoid VCF to BCF conversion
@@ -662,6 +662,7 @@ static void concat(args_t *args)
}
str++;
}
+ fp->line.l = str - fp->line.s;
str = fp->line.s;
}
while ( *str && *str!='\t' ) str++;
@@ -918,7 +919,7 @@ static void naive_concat(args_t *args)
// Output all non-header data that were read together with the header block
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",bgzf_out->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode);
@@ -951,7 +952,7 @@ static void naive_concat(args_t *args)
}
free(buf);
free(tmp.s);
- if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+ if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",strerror(errno));
}
static void usage(args_t *args)
@@ -986,7 +987,7 @@ static void usage(args_t *args)
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n");
- fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
+ fprintf(stderr, " -v, --verbosity INT Set verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
@@ -1008,6 +1009,7 @@ int main_vcfconcat(int argc, char *argv[])
static struct option loptions[] =
{
{"verbose",required_argument,NULL,'v'},
+ {"verbosity",required_argument,NULL,'v'},
{"naive",no_argument,NULL,'n'},
{"naive-force",no_argument,NULL,7},
{"compact-PS",no_argument,NULL,'c'},
@@ -1080,6 +1082,7 @@ int main_vcfconcat(int argc, char *argv[])
case 'v':
args->verbose = strtol(optarg, &tmp, 0);
if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
break;
case 'W':
if (!(args->write_index = write_index_parse(optarg)))
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c
index d238dc04..6e4b47f3 100644
--- a/bcftools/vcfconcat.c.pysam.c
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -643,7 +643,7 @@ static void concat(args_t *args)
bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
bcf_hdr_destroy(hdr_ori);
}
- if ( !fp->is_bin && args->output_type&FT_VCF )
+ if ( !fp->is_bin && args->output_type&FT_VCF && !args->out_fh->idx)
{
line->max_unpack = BCF_UN_STR;
// if VCF is on both input and output, avoid VCF to BCF conversion
@@ -664,6 +664,7 @@ static void concat(args_t *args)
}
str++;
}
+ fp->line.l = str - fp->line.s;
str = fp->line.s;
}
while ( *str && *str!='\t' ) str++;
@@ -920,7 +921,7 @@ static void naive_concat(args_t *args)
// Output all non-header data that were read together with the header block
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",bgzf_out->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode);
@@ -953,7 +954,7 @@ static void naive_concat(args_t *args)
}
free(buf);
free(tmp.s);
- if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+ if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",strerror(errno));
}
static void usage(args_t *args)
@@ -988,7 +989,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n");
- fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Set verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
@@ -1010,6 +1011,7 @@ int main_vcfconcat(int argc, char *argv[])
static struct option loptions[] =
{
{"verbose",required_argument,NULL,'v'},
+ {"verbosity",required_argument,NULL,'v'},
{"naive",no_argument,NULL,'n'},
{"naive-force",no_argument,NULL,7},
{"compact-PS",no_argument,NULL,'c'},
@@ -1082,6 +1084,7 @@ int main_vcfconcat(int argc, char *argv[])
case 'v':
args->verbose = strtol(optarg, &tmp, 0);
if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
break;
case 'W':
if (!(args->write_index = write_index_parse(optarg)))
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
index f75085aa..b01742ae 100644
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -1,6 +1,6 @@
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -1583,6 +1583,7 @@ static void gvcf_to_vcf(args_t *args)
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1);
strncpy(line->d.allele[0],ref,len);
+ bcf_update_alleles(hdr,line,(const char**)line->d.allele,line->n_allele);
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
free(ref);
}
@@ -1620,11 +1621,12 @@ static void usage(void)
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, "\n");
- fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, "General options:\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
@@ -1719,12 +1721,16 @@ int main_vcfconvert(int argc, char *argv[])
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
{"keep-duplicates",no_argument,NULL,12},
+ {"verbosity",required_argument,NULL,'v'},
{"write-index",optional_argument,NULL,'W'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
index f8921bf6..07f7961c 100644
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -1585,6 +1585,7 @@ static void gvcf_to_vcf(args_t *args)
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1);
strncpy(line->d.allele[0],ref,len);
+ bcf_update_alleles(hdr,line,(const char**)line->d.allele,line->n_allele);
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
free(ref);
}
@@ -1622,11 +1623,12 @@ static void usage(void)
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, "\n");
- fprintf(bcftools_stderr, "VCF output options:\n");
+ fprintf(bcftools_stderr, "General options:\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
@@ -1721,12 +1723,16 @@ int main_vcfconvert(int argc, char *argv[])
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
{"keep-duplicates",no_argument,NULL,12},
+ {"verbosity",required_argument,NULL,'v'},
{"write-index",optional_argument,NULL,'W'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c
index 52d4f945..52d85ea7 100644
--- a/bcftools/vcffilter.c
+++ b/bcftools/vcffilter.c
@@ -1,6 +1,6 @@
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -493,6 +493,7 @@ static void usage(args_t *args)
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
@@ -537,11 +538,15 @@ int main_vcffilter(int argc, char *argv[])
{"IndelGap",required_argument,NULL,'G'},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'g':
args->snp_gap = strtol(optarg,&tmp,10);
if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c
index c240f799..b42a05fb 100644
--- a/bcftools/vcffilter.c.pysam.c
+++ b/bcftools/vcffilter.c.pysam.c
@@ -2,7 +2,7 @@
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -495,6 +495,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
@@ -539,11 +540,15 @@ int main_vcffilter(int argc, char *argv[])
{"IndelGap",required_argument,NULL,'G'},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'g':
args->snp_gap = strtol(optarg,&tmp,10);
if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c
index be886db3..6f7db004 100644
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -1,6 +1,6 @@
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -1176,6 +1176,7 @@ static void usage(void)
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Check discordance of all samples from B against all samples in A\n");
fprintf(stderr, " bcftools gtcheck -g A.bcf B.bcf\n");
@@ -1247,11 +1248,15 @@ int main_vcfgtcheck(int argc, char *argv[])
{"targets-overlap",required_argument,NULL,8},
{"pairs",1,0,'p'},
{"pairs-file",1,0,'P'},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c
index de7c6162..d5c9f2fa 100644
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -1178,6 +1178,7 @@ static void usage(void)
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Check discordance of all samples from B against all samples in A\n");
fprintf(bcftools_stderr, " bcftools gtcheck -g A.bcf B.bcf\n");
@@ -1249,11 +1250,15 @@ int main_vcfgtcheck(int argc, char *argv[])
{"targets-overlap",required_argument,NULL,8},
{"pairs",1,0,'p'},
{"pairs-file",1,0,'P'},
+ {"verbosity",required_argument,NULL,'v'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfhead.c b/bcftools/vcfhead.c
index 0b0222b5..a1815744 100644
--- a/bcftools/vcfhead.c
+++ b/bcftools/vcfhead.c
@@ -1,7 +1,7 @@
/* vcfhead.c -- view VCF/BCF file headers.
Copyright (C) 2021 University of Glasgow.
- Copyright (C) 2023 Genome Research Ltd.
+ Copyright (C) 2023-2025 Genome Research Ltd.
Author: John Marshall
@@ -42,15 +42,17 @@ int main_vcfhead(int argc, char *argv[])
"Usage: bcftools head [OPTION]... [FILE]\n"
"\n"
"Options:\n"
-" -h, --headers INT Display INT header lines [all]\n"
-" -n, --records INT Display INT variant record lines [none]\n"
-" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n"
+" -h, --headers INT Display INT header lines [all]\n"
+" -n, --records INT Display INT variant record lines [none]\n"
+" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n"
+" -v, --verbosity INT Verbosity level\n"
"\n";
static const struct option loptions[] = {
{ "headers", required_argument, NULL, 'h' },
{ "records", required_argument, NULL, 'n' },
{ "samples", required_argument, NULL, 's' },
+ { "verbosity", required_argument, NULL, 'v' },
{ NULL, 0, NULL, 0 }
};
@@ -60,8 +62,11 @@ int main_vcfhead(int argc, char *argv[])
uint64_t nrecords = 0;
int c, nargs;
- while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:n:s:v:", loptions, NULL)) >= 0)
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break;
case 'n': nrecords = strtoull(optarg, NULL, 0); break;
case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break;
diff --git a/bcftools/vcfhead.c.pysam.c b/bcftools/vcfhead.c.pysam.c
index 832c9bd7..e751a614 100644
--- a/bcftools/vcfhead.c.pysam.c
+++ b/bcftools/vcfhead.c.pysam.c
@@ -3,7 +3,7 @@
/* vcfhead.c -- view VCF/BCF file headers.
Copyright (C) 2021 University of Glasgow.
- Copyright (C) 2023 Genome Research Ltd.
+ Copyright (C) 2023-2025 Genome Research Ltd.
Author: John Marshall
@@ -44,15 +44,17 @@ int main_vcfhead(int argc, char *argv[])
"Usage: bcftools head [OPTION]... [FILE]\n"
"\n"
"Options:\n"
-" -h, --headers INT Display INT header lines [all]\n"
-" -n, --records INT Display INT variant record lines [none]\n"
-" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n"
+" -h, --headers INT Display INT header lines [all]\n"
+" -n, --records INT Display INT variant record lines [none]\n"
+" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n"
+" -v, --verbosity INT Verbosity level\n"
"\n";
static const struct option loptions[] = {
{ "headers", required_argument, NULL, 'h' },
{ "records", required_argument, NULL, 'n' },
{ "samples", required_argument, NULL, 's' },
+ { "verbosity", required_argument, NULL, 'v' },
{ NULL, 0, NULL, 0 }
};
@@ -62,8 +64,11 @@ int main_vcfhead(int argc, char *argv[])
uint64_t nrecords = 0;
int c, nargs;
- while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:n:s:v:", loptions, NULL)) >= 0)
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break;
case 'n': nrecords = strtoull(optarg, NULL, 0); break;
case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break;
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c
index 17eac5f3..d6029832 100644
--- a/bcftools/vcfindex.c
+++ b/bcftools/vcfindex.c
@@ -1,6 +1,6 @@
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014-2024 Genome Research Ltd.
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Shane McCarthy
@@ -57,6 +57,7 @@ static void usage(void)
fprintf(stderr, " -o, --output FILE optional output index file name\n");
fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT verbosity level\n");
fprintf(stderr, "\n");
fprintf(stderr, "Stats options:\n");
fprintf(stderr, " -a, --all with --stats, print stats for all contigs even when zero\n");
@@ -236,16 +237,20 @@ int main_vcfindex(int argc, char *argv[])
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
{"threads",required_argument,NULL,9},
+ {"verbosity",required_argument,NULL,'v'},
{"output-file",required_argument,NULL,'o'},
{"output",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:snao:v:", loptions, NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c
index 8f6932f1..e103cac0 100644
--- a/bcftools/vcfindex.c.pysam.c
+++ b/bcftools/vcfindex.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014-2024 Genome Research Ltd.
+ Copyright (C) 2014-2025 Genome Research Ltd.
Author: Shane McCarthy
@@ -59,6 +59,7 @@ static void usage(void)
fprintf(bcftools_stderr, " -o, --output FILE optional output index file name\n");
fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT verbosity level\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Stats options:\n");
fprintf(bcftools_stderr, " -a, --all with --stats, print stats for all contigs even when zero\n");
@@ -238,16 +239,20 @@ int main_vcfindex(int argc, char *argv[])
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
{"threads",required_argument,NULL,9},
+ {"verbosity",required_argument,NULL,'v'},
{"output-file",required_argument,NULL,'o'},
{"output",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:snao:v:", loptions, NULL)) >= 0)
{
switch (c)
{
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
index 24a45685..51750c17 100644
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -1,6 +1,6 @@
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2023 Genome Research Ltd.
+ Copyright (C) 2012-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -460,7 +460,7 @@ static void destroy_data(args_t *args)
{
if ( !args->fnames[i] ) continue;
if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]);
- int is_tbi = !args->write_index
+ int is_tbi = !args->write_index
|| (args->write_index&127) == HTS_FMT_TBI;
if ( args->output_type==FT_VCF_GZ && is_tbi )
{
@@ -476,8 +476,8 @@ static void destroy_data(args_t *args)
free(args->fh_out);
free(args->fnames);
if ( args->fh_sites ) fclose(args->fh_sites);
- if ( args->write ) free(args->write);
}
+ free(args->write);
}
static void usage(void)
@@ -487,7 +487,7 @@ static void usage(void)
fprintf(stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none|id>, see man page for details [none]\n");
fprintf(stderr, " -C, --complement Output positions present only in the first file but missing in the others\n");
fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
@@ -504,7 +504,8 @@ static void usage(void)
fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
- fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
@@ -565,11 +566,15 @@ int main_vcfisec(int argc, char *argv[])
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
@@ -597,6 +602,7 @@ int main_vcfisec(int argc, char *argv[])
else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID;
else error("The --collapse string \"%s\" not recognised.\n", optarg);
break;
case 'f': args->files->apply_filters = optarg; break;
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
index f4727c1d..f3c9e02d 100644
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2023 Genome Research Ltd.
+ Copyright (C) 2012-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -462,7 +462,7 @@ static void destroy_data(args_t *args)
{
if ( !args->fnames[i] ) continue;
if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]);
- int is_tbi = !args->write_index
+ int is_tbi = !args->write_index
|| (args->write_index&127) == HTS_FMT_TBI;
if ( args->output_type==FT_VCF_GZ && is_tbi )
{
@@ -478,8 +478,8 @@ static void destroy_data(args_t *args)
free(args->fh_out);
free(args->fnames);
if ( args->fh_sites ) fclose(args->fh_sites);
- if ( args->write ) free(args->write);
}
+ free(args->write);
}
static void usage(void)
@@ -489,7 +489,7 @@ static void usage(void)
fprintf(bcftools_stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none|id>, see man page for details [none]\n");
fprintf(bcftools_stderr, " -C, --complement Output positions present only in the first file but missing in the others\n");
fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
@@ -506,7 +506,8 @@ static void usage(void)
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
- fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
@@ -567,11 +568,15 @@ int main_vcfisec(int argc, char *argv[])
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
@@ -599,6 +604,7 @@ int main_vcfisec(int argc, char *argv[])
else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID;
else error("The --collapse string \"%s\" not recognised.\n", optarg);
break;
case 'f': args->files->apply_filters = optarg; break;
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
index 3ca5f287..ce6a71c9 100644
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -1,6 +1,6 @@
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2024 Genome Research Ltd.
+ Copyright (C) 2012-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -132,6 +132,7 @@ typedef struct
int mrec; // allocated size of buf
maux1_t *rec; // buffer to keep reader's lines
bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+ bcf_hdr_t *hdr; // this reader's header
int var_types; // reader's variant types in the active [beg,end] window
}
buffer_t;
@@ -871,7 +872,10 @@ maux_t *maux_init(args_t *args)
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
for (i=0; i<ma->n; i++)
+ {
ma->buf[i].rid = -1;
+ ma->buf[i].hdr = files->readers[i].header;
+ }
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
if ( args->local_alleles )
{
@@ -1759,7 +1763,11 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
type_t val = convert(&p_ori[k * sizeof(type_t)]); \
if ( val==vector_end ) break; /* smaller ploidy */ \
ma->smpl_ploidy[ismpl+j]++; \
- if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+ if ( bcf_gt_is_missing(val) ) \
+ { \
+ if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \
+ else tmp[k] = 0; /* missing allele, unphased */ \
+ } \
else tmp[k] = val; \
} \
for (; ksmpl_ploidy[ismpl+j]++; \
- if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+ if ( bcf_gt_is_missing(val) ) \
+ { \
+ if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \
+ else tmp[k] = 0; /* missing allele, unphased */ \
+ } \
else \
{ \
int al = (val>>1) - 1; \
@@ -2711,20 +2723,33 @@ void gvcf_flush(args_t *args, int done)
}
}
-static inline int is_gvcf_block(bcf1_t *line)
+static inline int is_gvcf_block(args_t *args, bcf1_t *line)
 {
+    maux_t *ma;
+    // Returns 1 when the record qualifies as a gVCF block, 0 otherwise. Not a block when rlen<=1 or when REF is spelled out over the full rlen
     if ( line->rlen<=1 ) return 0;
     if ( strlen(line->d.allele[0])==line->rlen ) return 0;
-    if ( line->n_allele==1 ) return 1;
+    if ( line->n_allele==1 ) goto is_gvcf;
     int i;
     for (i=1; i<line->n_allele; i++)
     {
-        if ( !strcmp(line->d.allele[i],"<*>") ) return 1;
-        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return 1;
-        if ( !strcmp(line->d.allele[i],"<X>") ) return 1;
+        if ( !strcmp(line->d.allele[i],"<*>") ) goto is_gvcf;
+        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) goto is_gvcf;
+        if ( !strcmp(line->d.allele[i],"<X>") ) goto is_gvcf;
     }
     return 0;
+    // First gVCF block seen (ma->gvcf still NULL): turn on args->do_gvcf and allocate ma->n gvcf_aux_t entries, each with its own bcf1_t
+is_gvcf:
+    ma = args->maux;
+    if ( !ma->gvcf )
+    {
+        args->do_gvcf = 1;
+        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
+        for (i=0; i<ma->n; i++)
+            ma->gvcf[i].line = bcf_init1();
+    }
+    return 1;
 }
/*
@@ -2764,7 +2789,7 @@ void gvcf_stage(args_t *args, int pos)
int irec = maux->buf[i].beg;
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
bcf1_t *line = args->files->readers[i].buffer[irec];
- int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
+ int ret = is_gvcf_block(args,line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
if ( ret==1 )
{
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
@@ -2917,23 +2942,48 @@ static const int
indel_mask = (VCF_INDEL<<1),
ins_mask = VCF_INS<<1,
del_mask = VCF_DEL<<1,
- ref_mask = 1;
+ ref_mask = 1,
+ other_mask = VCF_OTHER<<1;
+
+typedef struct
+{
+ int types, // bitmask of the variant types selected so far, see the *_mask(s) above
+ end; // END coordinate stored by types_compatible() for the first record: its INFO/END if a symbolic allele is involved, -1 otherwise
+ bcf1_t *rec; // the first record selected
+}
+selected_t;
// Can these types be merged given the -m settings? Despite the function's name, its focus is on
// excluding incompatible records, there will be a finer matching later in stage_line()
-static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+static inline int types_compatible(args_t *args, selected_t *selected, buffer_t *buf, int irec)
{
int k;
maux_t *maux = args->maux;
bcf1_t *rec = buf->lines[irec];
int rec_types = buf->rec[irec].var_types;
- assert( selected_types ); // this is trivially true, set in can_merge()
+ int end = -1;
+ if ( rec_types&other_mask )
+ {
+ int32_t *itmp = NULL, nitmp = 0;
+ bcf_get_info_int32(buf->hdr,rec,"END",&itmp,&nitmp);
+ end = nitmp==1 ? itmp[0] : -1;
+ free(itmp);
+ }
+
+ // First time here?
+ if ( !selected->types )
+ {
+ selected->end = end;
+ selected->rec = rec;
+ selected->types = rec_types;
+ return 1;
+ }
if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
// REF and gVCF_REF with no other alleles present can be merged with anything
- if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+ if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1;
if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
if ( args->collapse!=COLLAPSE_NONE )
@@ -2944,26 +2994,26 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
// - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
{
- if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+ if ( (rec_types&snp_mask) && (selected->types&snp_mask) ) return 1;
}
if ( args->collapse&COLLAPSE_INDELS )
{
- if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+ if ( (rec_types&indel_mask) && (selected->types&indel_mask) ) return 1;
}
if ( args->collapse&COLLAPSE_SNP_INS_DEL )
{
- if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
- if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+ if ( (rec_types&ins_mask) && (selected->types&ins_mask) ) return 1;
+ if ( (rec_types&del_mask) && (selected->types&del_mask) ) return 1;
}
// Whatever is left, allow to match if the alleles match exactly
}
// The -m none mode or exact matching requested
// Simple test first: are the variants of the same type?
- int x = selected_types >> 1; // remove REF
- int y = rec_types >> 1; // remove REF
- while ( x && y ) { x>>=1; y>>=1; }
- if ( x || y ) return 0; // the types differ
+ int x = selected->types;
+ int y = rec_types;
+ if ( !(x&y) ) return 0; // no matching type
+ if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset
if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible
for (k=1; k<rec->n_allele; k++)
@@ -2972,6 +3022,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
}
if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k]
+
+ if ( selected->types&other_mask && rec_types&other_mask )
+ {
+ // both records have symbolic alleles and the alleles are the same
+ if ( selected->end!=end ) return 0;
+ }
+
return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF
}
@@ -3089,7 +3146,7 @@ int can_merge(args_t *args)
var_type &= ~VCF_INDEL;
}
var_type = var_type ? var_type<<1 : ref_mask;
- if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+ if ( args->do_gvcf && is_gvcf_block(args,line) ) var_type |= ref_mask;
buf->rec[j].var_types = var_type;
}
maux->var_types |= buf->rec[j].var_types;
@@ -3098,7 +3155,7 @@ int can_merge(args_t *args)
}
if ( !ntodo ) return 0;
- int selected_types = 0;
+ selected_t selected = {0,0,NULL};
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
@@ -3113,7 +3170,7 @@ int can_merge(args_t *args)
gaux[i].line->d.allele[0][0] = ref;
gaux[i].line->pos = maux->pos;
maux_update_alleles(args, i, buf->beg);
- selected_types |= ref_mask;
+ selected.types |= ref_mask;
continue;
}
for (j=buf->beg; j<buf->end; j++)
@@ -3128,16 +3185,25 @@ int can_merge(args_t *args)
{
if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record
}
- else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
- else
- {
- // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
- if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
- && (maux->var_types&snp_mask) // there are SNVs at the current position
- && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
- ) continue;
- }
- selected_types |= line_types;
+ else if ( !types_compatible(args,&selected,buf,j) ) continue;
+
+ // This is not a good code. It makes the incorrect assumption of always having a SNP record available.
+ // However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf).
+ // We'd need to first check if it is possible to merge with something at all, and only then start excluding.
+ // Anyway, the can_merge() function should be about a *possibility*, one might argue that the priority should be handled in
+ // the stage_line() function.
+ // Commenting this out makes only one difference in our test case: reorders the output lines so that indels can come first.
+ //
+ // else
+ // {
+ // // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+ // if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
+ // && (maux->var_types&snp_mask) // there are SNVs at the current position
+ // && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+ // ) continue;
+ // }
+
+ selected.types |= line_types;
buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
maux_update_alleles(args, i, j);
@@ -3180,7 +3246,8 @@ void stage_line(args_t *args)
if ( buf->rec[j].skip )
{
int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
- if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+ if ( !is_gvcf && is_gvcf_block(args,buf->lines[j]) ) is_gvcf = 1;
+ if ( is_gvcf && buf->rec[j].skip && !maux->gvcf[i].active ) continue;
if ( !is_gvcf ) continue; // done or not compatible
}
if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line
@@ -3371,6 +3438,7 @@ void merge_vcf(args_t *args)
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
+ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( args->header_fname )
{
@@ -3392,7 +3460,6 @@ void merge_vcf(args_t *args)
info_rules_init(args);
missing_rules_init(args);
- bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( args->header_only )
{
@@ -3488,6 +3555,7 @@ static void usage(void)
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
@@ -3534,11 +3602,15 @@ int main_vcfmerge(int argc, char *argv[])
{"force-single",no_argument,NULL,12},
{"filter-logic",required_argument,NULL,'F'},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'L':
args->local_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
index d0802d07..eb310b85 100644
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2024 Genome Research Ltd.
+ Copyright (C) 2012-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -134,6 +134,7 @@ typedef struct
int mrec; // allocated size of buf
maux1_t *rec; // buffer to keep reader's lines
bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+ bcf_hdr_t *hdr; // this reader's header
int var_types; // reader's variant types in the active [beg,end] window
}
buffer_t;
@@ -873,7 +874,10 @@ maux_t *maux_init(args_t *args)
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
for (i=0; i<ma->n; i++)
+ {
ma->buf[i].rid = -1;
+ ma->buf[i].hdr = files->readers[i].header;
+ }
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
if ( args->local_alleles )
{
@@ -1761,7 +1765,11 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
type_t val = convert(&p_ori[k * sizeof(type_t)]); \
if ( val==vector_end ) break; /* smaller ploidy */ \
ma->smpl_ploidy[ismpl+j]++; \
- if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+ if ( bcf_gt_is_missing(val) ) \
+ { \
+ if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \
+ else tmp[k] = 0; /* missing allele, unphased */ \
+ } \
else tmp[k] = val; \
} \
for (; ksmpl_ploidy[ismpl+j]++; \
- if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+ if ( bcf_gt_is_missing(val) ) \
+ { \
+ if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \
+ else tmp[k] = 0; /* missing allele, unphased */ \
+ } \
else \
{ \
int al = (val>>1) - 1; \
@@ -2713,20 +2725,33 @@ void gvcf_flush(args_t *args, int done)
}
}
-static inline int is_gvcf_block(bcf1_t *line)
+static inline int is_gvcf_block(args_t *args, bcf1_t *line)
 {
+    maux_t *ma;
+    // Returns 1 when the record qualifies as a gVCF block, 0 otherwise. Not a block when rlen<=1 or when REF is spelled out over the full rlen
     if ( line->rlen<=1 ) return 0;
     if ( strlen(line->d.allele[0])==line->rlen ) return 0;
-    if ( line->n_allele==1 ) return 1;
+    if ( line->n_allele==1 ) goto is_gvcf;
     int i;
     for (i=1; i<line->n_allele; i++)
     {
-        if ( !strcmp(line->d.allele[i],"<*>") ) return 1;
-        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return 1;
-        if ( !strcmp(line->d.allele[i],"<X>") ) return 1;
+        if ( !strcmp(line->d.allele[i],"<*>") ) goto is_gvcf;
+        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) goto is_gvcf;
+        if ( !strcmp(line->d.allele[i],"<X>") ) goto is_gvcf;
     }
     return 0;
+    // First gVCF block seen (ma->gvcf still NULL): turn on args->do_gvcf and allocate ma->n gvcf_aux_t entries, each with its own bcf1_t
+is_gvcf:
+    ma = args->maux;
+    if ( !ma->gvcf )
+    {
+        args->do_gvcf = 1;
+        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
+        for (i=0; i<ma->n; i++)
+            ma->gvcf[i].line = bcf_init1();
+    }
+    return 1;
 }
/*
@@ -2766,7 +2791,7 @@ void gvcf_stage(args_t *args, int pos)
int irec = maux->buf[i].beg;
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
bcf1_t *line = args->files->readers[i].buffer[irec];
- int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
+ int ret = is_gvcf_block(args,line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
if ( ret==1 )
{
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
@@ -2919,23 +2944,48 @@ static const int
indel_mask = (VCF_INDEL<<1),
ins_mask = VCF_INS<<1,
del_mask = VCF_DEL<<1,
- ref_mask = 1;
+ ref_mask = 1,
+ other_mask = VCF_OTHER<<1;
+
+typedef struct
+{
+ int types, // bitmask of the variant types selected so far, see the *_mask(s) above
+ end; // END coordinate stored by types_compatible() for the first record: its INFO/END if a symbolic allele is involved, -1 otherwise
+ bcf1_t *rec; // the first record selected
+}
+selected_t;
// Can these types be merged given the -m settings? Despite the function's name, its focus is on
// excluding incompatible records, there will be a finer matching later in stage_line()
-static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+static inline int types_compatible(args_t *args, selected_t *selected, buffer_t *buf, int irec)
{
int k;
maux_t *maux = args->maux;
bcf1_t *rec = buf->lines[irec];
int rec_types = buf->rec[irec].var_types;
- assert( selected_types ); // this is trivially true, set in can_merge()
+ int end = -1;
+ if ( rec_types&other_mask )
+ {
+ int32_t *itmp = NULL, nitmp = 0;
+ bcf_get_info_int32(buf->hdr,rec,"END",&itmp,&nitmp);
+ end = nitmp==1 ? itmp[0] : -1;
+ free(itmp);
+ }
+
+ // First time here?
+ if ( !selected->types )
+ {
+ selected->end = end;
+ selected->rec = rec;
+ selected->types = rec_types;
+ return 1;
+ }
if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
// REF and gVCF_REF with no other alleles present can be merged with anything
- if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+ if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1;
if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
if ( args->collapse!=COLLAPSE_NONE )
@@ -2946,26 +2996,26 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
// - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
{
- if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+ if ( (rec_types&snp_mask) && (selected->types&snp_mask) ) return 1;
}
if ( args->collapse&COLLAPSE_INDELS )
{
- if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+ if ( (rec_types&indel_mask) && (selected->types&indel_mask) ) return 1;
}
if ( args->collapse&COLLAPSE_SNP_INS_DEL )
{
- if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
- if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+ if ( (rec_types&ins_mask) && (selected->types&ins_mask) ) return 1;
+ if ( (rec_types&del_mask) && (selected->types&del_mask) ) return 1;
}
// Whatever is left, allow to match if the alleles match exactly
}
// The -m none mode or exact matching requested
// Simple test first: are the variants of the same type?
- int x = selected_types >> 1; // remove REF
- int y = rec_types >> 1; // remove REF
- while ( x && y ) { x>>=1; y>>=1; }
- if ( x || y ) return 0; // the types differ
+ int x = selected->types;
+ int y = rec_types;
+ if ( !(x&y) ) return 0; // no matching type
+ if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset
if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible
for (k=1; k<rec->n_allele; k++)
@@ -2974,6 +3024,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
}
if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k]
+
+ if ( selected->types&other_mask && rec_types&other_mask )
+ {
+ // both records have symbolic alleles and the alleles are the same
+ if ( selected->end!=end ) return 0;
+ }
+
return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF
}
@@ -3091,7 +3148,7 @@ int can_merge(args_t *args)
var_type &= ~VCF_INDEL;
}
var_type = var_type ? var_type<<1 : ref_mask;
- if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+ if ( args->do_gvcf && is_gvcf_block(args,line) ) var_type |= ref_mask;
buf->rec[j].var_types = var_type;
}
maux->var_types |= buf->rec[j].var_types;
@@ -3100,7 +3157,7 @@ int can_merge(args_t *args)
}
if ( !ntodo ) return 0;
- int selected_types = 0;
+ selected_t selected = {0,0,NULL};
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
@@ -3115,7 +3172,7 @@ int can_merge(args_t *args)
gaux[i].line->d.allele[0][0] = ref;
gaux[i].line->pos = maux->pos;
maux_update_alleles(args, i, buf->beg);
- selected_types |= ref_mask;
+ selected.types |= ref_mask;
continue;
}
for (j=buf->beg; j<buf->end; j++)
@@ -3130,16 +3187,25 @@ int can_merge(args_t *args)
{
if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record
}
- else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
- else
- {
- // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
- if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
- && (maux->var_types&snp_mask) // there are SNVs at the current position
- && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
- ) continue;
- }
- selected_types |= line_types;
+ else if ( !types_compatible(args,&selected,buf,j) ) continue;
+
+ // This is not a good code. It makes the incorrect assumption of always having a SNP record available.
+ // However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf).
+ // We'd need to first check if it is possible to merge with something at all, and only then start excluding.
+ // Anyway, the can_merge() function should be about a *possibility*, one might argue that the priority should be handled in
+ // the stage_line() function.
+ // Commenting this out makes only one difference in our test case: reorders the output lines so that indels can come first.
+ //
+ // else
+ // {
+ // // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+ // if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
+ // && (maux->var_types&snp_mask) // there are SNVs at the current position
+ // && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+ // ) continue;
+ // }
+
+ selected.types |= line_types;
buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
maux_update_alleles(args, i, j);
@@ -3182,7 +3248,8 @@ void stage_line(args_t *args)
if ( buf->rec[j].skip )
{
int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
- if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+ if ( !is_gvcf && is_gvcf_block(args,buf->lines[j]) ) is_gvcf = 1;
+ if ( is_gvcf && buf->rec[j].skip && !maux->gvcf[i].active ) continue;
if ( !is_gvcf ) continue; // done or not compatible
}
if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line
@@ -3373,6 +3440,7 @@ void merge_vcf(args_t *args)
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
+ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( args->header_fname )
{
@@ -3394,7 +3462,6 @@ void merge_vcf(args_t *args)
info_rules_init(args);
missing_rules_init(args);
- bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( args->header_only )
{
@@ -3490,6 +3557,7 @@ static void usage(void)
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
@@ -3536,11 +3604,15 @@ int main_vcfmerge(int argc, char *argv[])
{"force-single",no_argument,NULL,12},
{"filter-logic",required_argument,NULL,'F'},
{"write-index",optional_argument,NULL,'W'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::v:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'v':
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'L':
args->local_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
index f4725338..cabbebfb 100644
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1,6 +1,6 @@
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -93,7 +93,7 @@ typedef struct
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, *tmp_sym, tmp_kstr;
+ kstring_t *tmp_als, *tmp_sym, tmp_kstr, old_rec_tag_kstr;
int ntmp_als, ntmp_sym;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
@@ -105,7 +105,7 @@ typedef struct
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
- int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious;
+ int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, nrmdup, mrows_op, mrows_collapse, parsimonious;
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
abuf_opt_t atomize;
@@ -113,7 +113,7 @@ typedef struct
char *old_rec_tag;
htsFile *out;
char *index_fn;
- int write_index, gff_verbosity;
+ int write_index, verbose;
int right_align;
char *gff_fname;
gff_t *gff;
@@ -127,6 +127,42 @@ typedef struct
}
args_t;
+static void old_rec_tag_init(args_t *args, bcf1_t *line)   // cache "seqname|pos+1|REF|ALT[,ALT..]" of line in args->old_rec_tag_kstr
+{
+    if ( !args->old_rec_tag ) return;   // no tag name set, nothing to cache
+
+    args->old_rec_tag_kstr.l = 0;       // reuse the buffer, discarding any previous snapshot
+    ksprintf(&args->old_rec_tag_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+    int i;
+    for (i=1; i<line->n_allele; i++)    // append the remaining alleles as a comma-separated list
+    {
+        kputs(line->d.allele[i],&args->old_rec_tag_kstr);
+        if ( i+1<line->n_allele ) kputc(',',&args->old_rec_tag_kstr);   // no trailing comma after the last allele
+    }
+}
+static void old_rec_tag_set(args_t *args, bcf1_t *line, int ialt)   // store the cached snapshot as INFO/<old_rec_tag> of line; ialt>0 appends "|ialt"
+{
+    if ( !args->old_rec_tag || !args->old_rec_tag_kstr.l ) return;   // no tag name set, or no snapshot cached by old_rec_tag_init()
+
+    // only update if the tag is not present already, there can be multiple normalization steps
+    int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+    bcf_unpack(line, BCF_UN_INFO);
+    for (i=0; i<line->n_info; i++)
+    {
+        bcf_info_t *inf = &line->d.info[i];
+        if ( inf && inf->key == id ) return;   // tag already present: keep it (the cached snapshot is left untouched)
+    }
+
+    if ( ialt>0 )
+    {
+        kputc('|',&args->old_rec_tag_kstr);
+        kputw(ialt,&args->old_rec_tag_kstr);
+    }
+    if ( (bcf_update_info_string(args->out_hdr, line, args->old_rec_tag, args->old_rec_tag_kstr.s))!=0 )
+        error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
+    args->old_rec_tag_kstr.l = 0;   // snapshot consumed; old_rec_tag_init() must run again before the next write
+}
+
static inline int replace_iupac_codes(char *seq, int nseq)
{
// Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end
@@ -159,7 +195,8 @@ static void seq_to_upper(char *seq, int len)
for (i=0; seq[i]; i++) seq[i] = nt_to_upper(seq[i]);
}
-static void fix_ref(args_t *args, bcf1_t *line)
+// returns 0 when no fix was needed, 1 otherwise
+static int fix_ref(args_t *args, bcf1_t *line)
{
bcf_unpack(line, BCF_UN_STR);
int reflen = strlen(line->d.allele[0]);
@@ -177,7 +214,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
args->nref.tot++;
// is the REF different? If not, we are done
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 0; }
// is the REF allele missing?
if ( reflen==1 && line->d.allele[0][0]=='.' )
@@ -186,11 +223,11 @@ static void fix_ref(args_t *args, bcf1_t *line)
args->nref.set++;
free(ref);
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- return;
+ return 1;
}
// does REF or ALT contain non-standard bases?
- int has_non_acgtn = 0;
+ int ret = 0, has_non_acgtn = 0;
for (i=0; in_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) continue;
@@ -200,7 +237,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
{
args->nref.set++;
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 1; }
+ ret = 1;
}
// does the REF allele contain N's ?
@@ -221,12 +259,12 @@ static void fix_ref(args_t *args, bcf1_t *line)
}
if ( fix )
{
+ ret = 1;
args->nref.set++;
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return ret; }
}
-
// is it swapped?
for (i=1; in_allele; i++)
{
@@ -237,6 +275,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
kstring_t str = {0,0,0};
if ( i==line->n_allele ) // none of the alternate alleles matches the reference
{
+ ret = 1;
args->nref.set++;
kputsn(ref,reflen,&str);
for (i=1; in_allele; i++)
@@ -247,7 +286,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
bcf_update_alleles_str(args->out_hdr,line,str.s);
free(ref);
free(str.s);
- return;
+ return ret;
}
// one of the alternate alleles matches the reference, assume it's a simple swap
@@ -289,6 +328,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
ac[i-1] = ni;
bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
}
+ return 1;
}
static void fix_dup_alt(args_t *args, bcf1_t *line)
@@ -334,41 +374,41 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
}
-static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
-{
- if ( !args->old_rec_tag ) return;
-
- // only update if the tag is not present already, there can be multiple normalization steps
- int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
- bcf_unpack(dst, BCF_UN_INFO);
- for (i=0; in_info; i++)
- {
- bcf_info_t *inf = &dst->d.info[i];
- if ( inf && inf->key == id ) return;
- }
-
- args->tmp_kstr.l = 0;
- ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
- for (i=1; in_allele; i++)
- {
- kputs(src->d.allele[i],&args->tmp_kstr);
- if ( i+1n_allele ) kputc(',',&args->tmp_kstr);
- }
- if ( ialt>0 )
- {
- kputc('|',&args->tmp_kstr);
- kputw(ialt,&args->tmp_kstr);
- }
- if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
- error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
-}
+// static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
+// {
+// fprintf(stderr,"remove me\n");
+// if ( !args->old_rec_tag ) return;
+//
+// // only update if the tag is not present already, there can be multiple normalization steps
+// int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+// bcf_unpack(dst, BCF_UN_INFO);
+// for (i=0; in_info; i++)
+// {
+// bcf_info_t *inf = &dst->d.info[i];
+// if ( inf && inf->key == id ) return;
+// }
+//
+// args->tmp_kstr.l = 0;
+// ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+// for (i=1; in_allele; i++)
+// {
+// kputs(src->d.allele[i],&args->tmp_kstr);
+// if ( i+1n_allele ) kputc(',',&args->tmp_kstr);
+// }
+// if ( ialt>0 )
+// {
+// kputc('|',&args->tmp_kstr);
+// kputw(ialt,&args->tmp_kstr);
+// }
+// if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
+// error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
+// }
static int is_left_align(args_t *args, bcf1_t *line)
{
if ( args->right_align ) return 0;
if ( !args->gff ) return 1;
const char *chr = bcf_seqname(args->hdr,line);
- if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do
if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
// if there are two conflicting overlapping transcripts, go with the default left-alignment
@@ -523,6 +563,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line)
static int realign(args_t *args, bcf1_t *line)
{
bcf_unpack(line, BCF_UN_STR);
+ old_rec_tag_init(args,line);
// Sanity check REF
int i, nref, reflen = strlen(line->d.allele[0]);
@@ -655,7 +696,7 @@ static int realign(args_t *args, bcf1_t *line)
}
if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
- set_old_rec_tag(args, line, line, 0);
+ old_rec_tag_set(args, line, 0);
// Create new block of alleles and update
args->tmp_kstr.l = 0;
@@ -1247,6 +1288,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1();
bcf1_t *dst = args->tmp_lines[i];
bcf_clear(dst);
+ old_rec_tag_init(args,line);
dst->rid = line->rid;
dst->pos = line->pos;
@@ -1271,7 +1313,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
else split_info_string(args, line, info, i, dst);
}
- set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
+ old_rec_tag_set(args, dst, i + 1); // 1-based indexes
dst->n_sample = line->n_sample;
for (j=0; jn_fmt; j++)
@@ -2138,10 +2180,10 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
int line_type = bcf_get_variant_types(args->lines[k]);
if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
{
- if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
- if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) { args->nrmdup++; continue; } // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) { args->nrmdup++; continue; }
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) { args->nrmdup++; continue; }
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) { args->nrmdup++; continue; }
}
else
{
@@ -2190,6 +2232,15 @@ static void init_data(args_t *args)
args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
}
+ if ( args->mrows_op==MROWS_SPLIT )
+ {
+ // check the sanity of splitted fields, specifically of SVLEN (#2371)
+ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"SVLEN");
+ if ( id>=0 && bcf_hdr_id2length(args->hdr,BCF_HL_INFO,id)!=BCF_VL_A )
+ fprintf(stderr,
+ "Warning: the tag INFO/SVLEN must be defined as Number=A in order for the field to be split\n"
+ " (the command `bcftools reheader` can be used to fix the header)\n");
+ }
if ( args->atomize==SPLIT )
{
args->abuf = abuf_init(args->hdr, SPLIT);
@@ -2204,8 +2255,7 @@ static void init_data(args_t *args)
if ( args->gff_fname )
{
args->gff = gff_init(args->gff_fname);
- gff_set(args->gff,verbosity,args->gff_verbosity);
- gff_set(args->gff,strip_chr_names,1);
+ gff_set(args->gff,verbosity,args->verbose);
gff_parse(args->gff);
args->idx_tscript = gff_get(args->gff,idx_tscript);
args->itr_tscript = regitr_init(NULL);
@@ -2246,6 +2296,7 @@ static void destroy_data(args_t *args)
free(args->tmp_als);
free(args->tmp_sym);
free(args->tmp_kstr.s);
+ free(args->old_rec_tag_kstr.s);
if ( args->tmp_str )
{
for (i=0; ihdr); i++) free(args->tmp_str[i].s);
@@ -2269,7 +2320,11 @@ static void normalize_line(args_t *args, bcf1_t *line)
{
if ( args->fai )
{
- if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line);
+ if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) )
+ {
+ old_rec_tag_init(args,line);
+ if ( fix_ref(args,line) ) old_rec_tag_set(args,line,0);
+ }
if ( args->do_indels )
{
int ret = args->filter_pass ? realign(args, line) : ERR_OK;
@@ -2425,8 +2480,8 @@ static void normalize_vcf(args_t *args)
}
if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
- fprintf(stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n",
- args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter);
+ fprintf(stderr,"Lines total/split/joined/realigned/mismatch_removed/dup_removed/skipped:\t%d/%d/%d/%d/%d/%d/%d\n",
+ args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nrmdup,args->nfilter);
if ( args->check_ref & CHECK_REF_FIX )
fprintf(stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
}
@@ -2467,7 +2522,7 @@ static void usage(void)
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
- fprintf(stderr, " -v, --verbose INT Verbosity level (0-2) of GFF parsing [1]\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
@@ -2500,7 +2555,7 @@ int main_vcfnorm(int argc, char *argv[])
int region_is_file = 0;
int targets_is_file = 0;
args->use_star_allele = 1;
- args->gff_verbosity = 1;
+ args->verbose = 1;
int regions_overlap = 1;
int targets_overlap = 0;
args->cmp_func = cmp_bcf_pos;
@@ -2539,6 +2594,7 @@ int main_vcfnorm(int argc, char *argv[])
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
{"verbose",required_argument,NULL,'v'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
@@ -2552,8 +2608,9 @@ int main_vcfnorm(int argc, char *argv[])
break;
case 'g': args->gff_fname = optarg; break;
case 'v':
- args->gff_verbosity = atoi(optarg);
- if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+ args->verbose = strtol(optarg,&tmp,10);
+ if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
break;
case 'a': args->atomize = SPLIT; break;
case 'e':
@@ -2633,7 +2690,7 @@ int main_vcfnorm(int argc, char *argv[])
break;
case 'o': args->output_fname = optarg; break;
case 'D':
- fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n");
+ fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d exact`.\n");
args->rmdup = BCF_SR_PAIR_EXACT;
break;
case 's': args->strict_filter = 1; break;
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
index 4fe92ec9..dc58bbbb 100644
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2024 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -95,7 +95,7 @@ typedef struct
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, *tmp_sym, tmp_kstr;
+ kstring_t *tmp_als, *tmp_sym, tmp_kstr, old_rec_tag_kstr;
int ntmp_als, ntmp_sym;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
@@ -107,7 +107,7 @@ typedef struct
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
- int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious;
+ int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, nrmdup, mrows_op, mrows_collapse, parsimonious;
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
abuf_opt_t atomize;
@@ -115,7 +115,7 @@ typedef struct
char *old_rec_tag;
htsFile *out;
char *index_fn;
- int write_index, gff_verbosity;
+ int write_index, verbose;
int right_align;
char *gff_fname;
gff_t *gff;
@@ -129,6 +129,42 @@ typedef struct
}
args_t;
+static void old_rec_tag_init(args_t *args, bcf1_t *line)   // cache "seqname|pos+1|REF|ALT[,ALT..]" of line in args->old_rec_tag_kstr
+{
+    if ( !args->old_rec_tag ) return;   // no tag name set, nothing to cache
+
+    args->old_rec_tag_kstr.l = 0;       // reuse the buffer, discarding any previous snapshot
+    ksprintf(&args->old_rec_tag_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+    int i;
+    for (i=1; i<line->n_allele; i++)    // append the remaining alleles as a comma-separated list
+    {
+        kputs(line->d.allele[i],&args->old_rec_tag_kstr);
+        if ( i+1<line->n_allele ) kputc(',',&args->old_rec_tag_kstr);   // no trailing comma after the last allele
+    }
+}
+static void old_rec_tag_set(args_t *args, bcf1_t *line, int ialt)   // store the cached snapshot as INFO/<old_rec_tag> of line; ialt>0 appends "|ialt"
+{
+    if ( !args->old_rec_tag || !args->old_rec_tag_kstr.l ) return;   // no tag name set, or no snapshot cached by old_rec_tag_init()
+
+    // only update if the tag is not present already, there can be multiple normalization steps
+    int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+    bcf_unpack(line, BCF_UN_INFO);
+    for (i=0; i<line->n_info; i++)
+    {
+        bcf_info_t *inf = &line->d.info[i];
+        if ( inf && inf->key == id ) return;   // tag already present: keep it (the cached snapshot is left untouched)
+    }
+
+    if ( ialt>0 )
+    {
+        kputc('|',&args->old_rec_tag_kstr);
+        kputw(ialt,&args->old_rec_tag_kstr);
+    }
+    if ( (bcf_update_info_string(args->out_hdr, line, args->old_rec_tag, args->old_rec_tag_kstr.s))!=0 )
+        error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
+    args->old_rec_tag_kstr.l = 0;   // snapshot consumed; old_rec_tag_init() must run again before the next write
+}
+
static inline int replace_iupac_codes(char *seq, int nseq)
{
// Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end
@@ -161,7 +197,8 @@ static void seq_to_upper(char *seq, int len)
for (i=0; seq[i]; i++) seq[i] = nt_to_upper(seq[i]);
}
-static void fix_ref(args_t *args, bcf1_t *line)
+// returns 0 when no fix was needed, 1 otherwise
+static int fix_ref(args_t *args, bcf1_t *line)
{
bcf_unpack(line, BCF_UN_STR);
int reflen = strlen(line->d.allele[0]);
@@ -179,7 +216,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
args->nref.tot++;
// is the REF different? If not, we are done
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 0; }
// is the REF allele missing?
if ( reflen==1 && line->d.allele[0][0]=='.' )
@@ -188,11 +225,11 @@ static void fix_ref(args_t *args, bcf1_t *line)
args->nref.set++;
free(ref);
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- return;
+ return 1;
}
// does REF or ALT contain non-standard bases?
- int has_non_acgtn = 0;
+ int ret = 0, has_non_acgtn = 0;
for (i=0; in_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) continue;
@@ -202,7 +239,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
{
args->nref.set++;
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 1; }
+ ret = 1;
}
// does the REF allele contain N's ?
@@ -223,12 +261,12 @@ static void fix_ref(args_t *args, bcf1_t *line)
}
if ( fix )
{
+ ret = 1;
args->nref.set++;
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
- if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return ret; }
}
-
// is it swapped?
for (i=1; in_allele; i++)
{
@@ -239,6 +277,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
kstring_t str = {0,0,0};
if ( i==line->n_allele ) // none of the alternate alleles matches the reference
{
+ ret = 1;
args->nref.set++;
kputsn(ref,reflen,&str);
for (i=1; in_allele; i++)
@@ -249,7 +288,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
bcf_update_alleles_str(args->out_hdr,line,str.s);
free(ref);
free(str.s);
- return;
+ return ret;
}
// one of the alternate alleles matches the reference, assume it's a simple swap
@@ -291,6 +330,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
ac[i-1] = ni;
bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
}
+ return 1;
}
static void fix_dup_alt(args_t *args, bcf1_t *line)
@@ -336,41 +376,41 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
}
-static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
-{
- if ( !args->old_rec_tag ) return;
-
- // only update if the tag is not present already, there can be multiple normalization steps
- int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
- bcf_unpack(dst, BCF_UN_INFO);
- for (i=0; in_info; i++)
- {
- bcf_info_t *inf = &dst->d.info[i];
- if ( inf && inf->key == id ) return;
- }
-
- args->tmp_kstr.l = 0;
- ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
- for (i=1; in_allele; i++)
- {
- kputs(src->d.allele[i],&args->tmp_kstr);
- if ( i+1n_allele ) kputc(',',&args->tmp_kstr);
- }
- if ( ialt>0 )
- {
- kputc('|',&args->tmp_kstr);
- kputw(ialt,&args->tmp_kstr);
- }
- if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
- error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
-}
+// static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
+// {
+// fprintf(bcftools_stderr,"remove me\n");
+// if ( !args->old_rec_tag ) return;
+//
+// // only update if the tag is not present already, there can be multiple normalization steps
+// int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+// bcf_unpack(dst, BCF_UN_INFO);
+// for (i=0; in_info; i++)
+// {
+// bcf_info_t *inf = &dst->d.info[i];
+// if ( inf && inf->key == id ) return;
+// }
+//
+// args->tmp_kstr.l = 0;
+// ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+// for (i=1; in_allele; i++)
+// {
+// kputs(src->d.allele[i],&args->tmp_kstr);
+// if ( i+1n_allele ) kputc(',',&args->tmp_kstr);
+// }
+// if ( ialt>0 )
+// {
+// kputc('|',&args->tmp_kstr);
+// kputw(ialt,&args->tmp_kstr);
+// }
+// if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
+// error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
+// }
static int is_left_align(args_t *args, bcf1_t *line)
{
if ( args->right_align ) return 0;
if ( !args->gff ) return 1;
const char *chr = bcf_seqname(args->hdr,line);
- if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do
if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
// if there are two conflicting overlapping transcripts, go with the default left-alignment
@@ -525,6 +565,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line)
static int realign(args_t *args, bcf1_t *line)
{
bcf_unpack(line, BCF_UN_STR);
+ old_rec_tag_init(args,line);
// Sanity check REF
int i, nref, reflen = strlen(line->d.allele[0]);
@@ -657,7 +698,7 @@ static int realign(args_t *args, bcf1_t *line)
}
if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
- set_old_rec_tag(args, line, line, 0);
+ old_rec_tag_set(args, line, 0);
// Create new block of alleles and update
args->tmp_kstr.l = 0;
@@ -1249,6 +1290,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1();
bcf1_t *dst = args->tmp_lines[i];
bcf_clear(dst);
+ old_rec_tag_init(args,line);
dst->rid = line->rid;
dst->pos = line->pos;
@@ -1273,7 +1315,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
else split_info_string(args, line, info, i, dst);
}
- set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
+ old_rec_tag_set(args, dst, i + 1); // 1-based indexes
dst->n_sample = line->n_sample;
for (j=0; jn_fmt; j++)
@@ -2140,10 +2182,10 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
int line_type = bcf_get_variant_types(args->lines[k]);
if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
{
- if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
- if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) { args->nrmdup++; continue; } // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) { args->nrmdup++; continue; }
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) { args->nrmdup++; continue; }
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) { args->nrmdup++; continue; }
}
else
{
@@ -2192,6 +2234,15 @@ static void init_data(args_t *args)
args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
}
+ if ( args->mrows_op==MROWS_SPLIT )
+ {
+ // check the sanity of splitted fields, specifically of SVLEN (#2371)
+ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"SVLEN");
+ if ( id>=0 && bcf_hdr_id2length(args->hdr,BCF_HL_INFO,id)!=BCF_VL_A )
+ fprintf(bcftools_stderr,
+ "Warning: the tag INFO/SVLEN must be defined as Number=A in order for the field to be split\n"
+ " (the command `bcftools reheader` can be used to fix the header)\n");
+ }
if ( args->atomize==SPLIT )
{
args->abuf = abuf_init(args->hdr, SPLIT);
@@ -2206,8 +2257,7 @@ static void init_data(args_t *args)
if ( args->gff_fname )
{
args->gff = gff_init(args->gff_fname);
- gff_set(args->gff,verbosity,args->gff_verbosity);
- gff_set(args->gff,strip_chr_names,1);
+ gff_set(args->gff,verbosity,args->verbose);
gff_parse(args->gff);
args->idx_tscript = gff_get(args->gff,idx_tscript);
args->itr_tscript = regitr_init(NULL);
@@ -2248,6 +2298,7 @@ static void destroy_data(args_t *args)
free(args->tmp_als);
free(args->tmp_sym);
free(args->tmp_kstr.s);
+ free(args->old_rec_tag_kstr.s);
if ( args->tmp_str )
{
for (i=0; ihdr); i++) free(args->tmp_str[i].s);
@@ -2271,7 +2322,11 @@ static void normalize_line(args_t *args, bcf1_t *line)
{
if ( args->fai )
{
- if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line);
+ if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) )
+ {
+ old_rec_tag_init(args,line);
+ if ( fix_ref(args,line) ) old_rec_tag_set(args,line,0);
+ }
if ( args->do_indels )
{
int ret = args->filter_pass ? realign(args, line) : ERR_OK;
@@ -2427,8 +2482,8 @@ static void normalize_vcf(args_t *args)
}
if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
- fprintf(bcftools_stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n",
- args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter);
+ fprintf(bcftools_stderr,"Lines total/split/joined/realigned/mismatch_removed/dup_removed/skipped:\t%d/%d/%d/%d/%d/%d/%d\n",
+ args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nrmdup,args->nfilter);
if ( args->check_ref & CHECK_REF_FIX )
fprintf(bcftools_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
}
@@ -2469,7 +2524,7 @@ static void usage(void)
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
- fprintf(bcftools_stderr, " -v, --verbose INT Verbosity level (0-2) of GFF parsing [1]\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
@@ -2502,7 +2557,7 @@ int main_vcfnorm(int argc, char *argv[])
int region_is_file = 0;
int targets_is_file = 0;
args->use_star_allele = 1;
- args->gff_verbosity = 1;
+ args->verbose = 1;
int regions_overlap = 1;
int targets_overlap = 0;
args->cmp_func = cmp_bcf_pos;
@@ -2541,6 +2596,7 @@ int main_vcfnorm(int argc, char *argv[])
{"no-version",no_argument,NULL,8},
{"write-index",optional_argument,NULL,'W'},
{"verbose",required_argument,NULL,'v'},
+ {"verbosity",required_argument,NULL,'v'},
{NULL,0,NULL,0}
};
char *tmp;
@@ -2554,8 +2610,9 @@ int main_vcfnorm(int argc, char *argv[])
break;
case 'g': args->gff_fname = optarg; break;
case 'v':
- args->gff_verbosity = atoi(optarg);
- if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+ args->verbose = strtol(optarg,&tmp,10);
+ if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
break;
case 'a': args->atomize = SPLIT; break;
case 'e':
@@ -2635,7 +2692,7 @@ int main_vcfnorm(int argc, char *argv[])
break;
case 'o': args->output_fname = optarg; break;
case 'D':
- fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n");
+ fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d exact`.\n");
args->rmdup = BCF_SR_PAIR_EXACT;
break;
case 's': args->strict_filter = 1; break;
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c
index 4ee99ee1..b3cbcbbf 100644
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -1,6 +1,6 @@
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -613,7 +613,7 @@ static void usage(args_t *args)
fprintf(stderr, "Plugin options:\n");
fprintf(stderr, " -h, --help List plugin's options\n");
fprintf(stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
+ fprintf(stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(stderr, " -V, --version Print version string and exit\n");
fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(stderr, "\n");
@@ -678,7 +678,8 @@ int main_plugin(int argc, char *argv[])
static struct option loptions[] =
{
{"version",no_argument,NULL,'V'},
- {"verbose",no_argument,NULL,'v'},
+ {"verbose",optional_argument,NULL,'v'},
+ {"verbosity",optional_argument,NULL,'v'},
{"help",no_argument,NULL,'h'},
{"list-plugins",no_argument,NULL,'l'},
{"output",required_argument,NULL,'o'},
@@ -697,11 +698,19 @@ int main_plugin(int argc, char *argv[])
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:v::VW::",loptions,NULL)) >= 0)
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose++; break;
+ case 'v':
+ if ( !optarg ) args->verbose++;
+ else
+ {
+ args->verbose = strtol(optarg,&tmp,10);
+ if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
+ }
+ break; // must not fall through to 'o', which would overwrite args->output_fname with optarg
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c
index f19bc963..024438a9 100644
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -615,7 +615,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, "Plugin options:\n");
fprintf(bcftools_stderr, " -h, --help List plugin's options\n");
fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
+ fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, " -V, --version Print version string and exit\n");
fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
@@ -680,7 +680,8 @@ int main_plugin(int argc, char *argv[])
static struct option loptions[] =
{
{"version",no_argument,NULL,'V'},
- {"verbose",no_argument,NULL,'v'},
+ {"verbose",optional_argument,NULL,'v'},
+ {"verbosity",optional_argument,NULL,'v'},
{"help",no_argument,NULL,'h'},
{"list-plugins",no_argument,NULL,'l'},
{"output",required_argument,NULL,'o'},
@@ -699,11 +700,19 @@ int main_plugin(int argc, char *argv[])
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:v::VW::",loptions,NULL)) >= 0)
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose++; break;
+ case 'v':
+ if ( !optarg ) args->verbose++;
+ else
+ {
+ args->verbose = strtol(optarg,&tmp,10);
+ if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ if ( args->verbose > 3 ) hts_verbose = args->verbose;
+ }
+ break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c
index 7b1dd439..eefb2a2d 100644
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -1,6 +1,6 @@
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -254,6 +254,7 @@ static void usage(void)
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n");
fprintf(stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n");
+ fprintf(stderr, " --verbosity INT Verbosity level\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
@@ -297,6 +298,7 @@ int main_vcfquery(int argc, char *argv[])
{"collapse",1,0,'c'},
{"vcf-list",1,0,'v'},
{"allow-undef-tags",0,0,'u'},
+ {"verbosity",required_argument,NULL,4},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) {
@@ -350,6 +352,9 @@ int main_vcfquery(int argc, char *argv[])
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
case 3 : args->force_samples = 1; break;
+ case 4 :
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c
index 407d2562..818e8d42 100644
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -2,7 +2,7 @@
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2023 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -256,6 +256,7 @@ static void usage(void)
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n");
fprintf(bcftools_stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n");
+ fprintf(bcftools_stderr, " --verbosity INT Verbosity level\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
@@ -299,6 +300,7 @@ int main_vcfquery(int argc, char *argv[])
{"collapse",1,0,'c'},
{"vcf-list",1,0,'v'},
{"allow-undef-tags",0,0,'u'},
+ {"verbosity",required_argument,NULL,4},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) {
@@ -352,6 +354,9 @@ int main_vcfquery(int argc, char *argv[])
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
case 3 : args->force_samples = 1; break;
+ case 4 :
+ if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c
index f1d1c86e..1b3eff91 100644
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -1,6 +1,6 @@
/* vcfroh.c -- HMM model for detecting runs of autozygosity.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2025 Genome Research Ltd.
Author: Petr Danecek
@@ -1076,40 +1076,48 @@ static void usage(args_t *args)
fprintf(stderr, "Usage: bcftools roh [options] \n");
fprintf(stderr, "\n");
fprintf(stderr, "General Options:\n");
- fprintf(stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n");
- fprintf(stderr, " --AF-tag use TAG for allele frequency\n");
- fprintf(stderr, " --AF-file