diff --git a/README.rst b/README.rst index 68f2fd55..9fa28cd3 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.21, samtools-1.21, and bcftools-1.21. +The current version of pysam wraps 3rd-party code from htslib-1.22, samtools-1.22, and bcftools-1.22. Pysam is available through `PyPI `_. To install, type:: diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index 51c2d040..5a4071d9 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -50,6 +50,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2 // newline will be added by the function. void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +// Set hts_verbose and return 0, or return -1 if str is not a valid integer +int apply_verbosity(const char *str); + // For on the fly index creation with --write-index int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname, int idx_fmt); int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname); diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 54f17c22..c3344206 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2024 Genome Research Ltd. + Copyright (c) 2014-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -228,24 +228,24 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); args->hdr = args->files->readers[0].header; args->isample = -1; - if ( !args->sample ) + if ( args->sample_fname ) { - args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); - if ( !args->smpl->n ) - { - smpl_ilist_destroy(args->smpl); - args->smpl = NULL; - } + args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); + if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } else if ( args->sample && strcmp("-",args->sample) ) { args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE); if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } - else if ( args->sample_fname ) + else if ( !args->sample ) { - args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); - if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); + args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); + if ( !args->smpl->n ) + { + smpl_ilist_destroy(args->smpl); + args->smpl = NULL; + } } if ( args->smpl ) { @@ -768,12 +768,26 @@ static void apply_variant(args_t *args, bcf1_t *rec) } if ( ialt==-1 ) { - char alleles[4]; - alleles[0] = rec->d.allele[0][0]; - alleles[1] = ','; - alleles[2] = args->missing_allele; - alleles[3] = 0; - bcf_update_alleles_str(args->hdr, rec, alleles); + // missing allele, it can be a single position or an entire gvcf block + if ( rec->rlen>1 && bcf_has_variant_types(rec,VCF_REF,bcf_match_exact)>0 ) + { + kstring_t str = {0,0,0}; + int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence + kputsn(args->fa_buf.s+idx,rec->rlen, &str); + kputc(',', &str); + for (i=0; irlen; i++) 
kputc(args->missing_allele, &str); + bcf_update_alleles_str(args->hdr, rec, str.s); + free(str.s); + } + else + { + char alleles[4]; + alleles[0] = rec->d.allele[0][0]; + alleles[1] = ','; + alleles[2] = args->missing_allele; + alleles[3] = 0; + bcf_update_alleles_str(args->hdr, rec, alleles); + } ialt = 1; } @@ -1203,6 +1217,7 @@ static void usage(args_t *args) fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n"); fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(stderr, " # in the form \">chr:from-to\".\n"); @@ -1240,13 +1255,17 @@ int main_consensus(int argc, char *argv[]) {"chain",1,0,'c'}, {"prefix",required_argument,0,'p'}, {"regions-overlap",required_argument,0,5}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 1 : args->mark_del = optarg[0]; break; case 2 : if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER; diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index a004f004..8b3eb4b1 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2024 Genome Research Ltd. + Copyright (c) 2014-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -230,24 +230,24 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); args->hdr = args->files->readers[0].header; args->isample = -1; - if ( !args->sample ) + if ( args->sample_fname ) { - args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); - if ( !args->smpl->n ) - { - smpl_ilist_destroy(args->smpl); - args->smpl = NULL; - } + args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); + if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } else if ( args->sample && strcmp("-",args->sample) ) { args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE); if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } - else if ( args->sample_fname ) + else if ( !args->sample ) { - args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); - if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); + args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); + if ( !args->smpl->n ) + { + smpl_ilist_destroy(args->smpl); + args->smpl = NULL; + } } if ( args->smpl ) { @@ -770,12 +770,26 @@ static void apply_variant(args_t *args, bcf1_t *rec) } if ( ialt==-1 ) { - char alleles[4]; - alleles[0] = rec->d.allele[0][0]; - alleles[1] = ','; - alleles[2] = args->missing_allele; - alleles[3] = 0; - bcf_update_alleles_str(args->hdr, rec, alleles); + // missing allele, it can be a single position or an entire gvcf block + if ( rec->rlen>1 && bcf_has_variant_types(rec,VCF_REF,bcf_match_exact)>0 ) + { + kstring_t str = {0,0,0}; + int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence + kputsn(args->fa_buf.s+idx,rec->rlen, &str); + kputc(',', &str); + for (i=0; irlen; i++) 
kputc(args->missing_allele, &str); + bcf_update_alleles_str(args->hdr, rec, str.s); + free(str.s); + } + else + { + char alleles[4]; + alleles[0] = rec->d.allele[0][0]; + alleles[1] = ','; + alleles[2] = args->missing_allele; + alleles[3] = 0; + bcf_update_alleles_str(args->hdr, rec, alleles); + } ialt = 1; } @@ -1205,6 +1219,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n"); fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n"); @@ -1242,13 +1257,17 @@ int main_consensus(int argc, char *argv[]) {"chain",1,0,'c'}, {"prefix",required_argument,0,'p'}, {"regions-overlap",required_argument,0,5}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 1 : args->mark_del = optarg[0]; break; case 2 : if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER; diff --git a/bcftools/convert.c b/bcftools/convert.c index c459c838..5ab39562 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ THE SOFTWARE. */ #define T_VKX 31 // VARIANTKEY HEX #define T_PBINOM 32 #define T_NPASS 33 +#define T_FILTER_EXPR 34 // print the results of -i/-e functions via query typedef struct _fmt_t { @@ -123,6 +125,16 @@ typedef struct } bcsq_t; +typedef struct +{ + filter_t *filter; + int nval; + double *val; +} +filter_expr_t; + +static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type); + static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); } static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); } static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); } @@ -1157,6 +1169,50 @@ static void destroy_npass(void *usr) { filter_destroy((filter_t*)usr); } +static void process_filter_expr(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + filter_expr_t *dat = (filter_expr_t*) fmt->usr; + + int i, nval, nval1; + const double *val; + if ( fmt->is_gt_field ) + { + if ( !fmt->ready ) + { + filter_test(dat->filter,line,NULL); + val = filter_get_doubles(dat->filter,&nval,&nval1); + if ( fmt->is_gt_field ) + { + if ( !dat->nval ) + { + dat->nval = nval; + dat->val = malloc(nval*sizeof(double)); + if ( !dat->val ) error("Error: failed to allocate %zu bytes\n",nval*sizeof(double)); + } + assert( dat->nval==nval ); + for (i=0; ival[i] = val[i]; + } + fmt->ready = 1; + } + val = dat->val; + nval = dat->nval; + } + else + { + filter_test(dat->filter,line,NULL); + val = filter_get_doubles(dat->filter,&nval,&nval1); + } + if ( isample<0 ) isample = 0; + if ( isample>=nval ) isample = 0; + kputd(val[isample], str); +} +static void destroy_filter_expr(void *usr) 
+{ + filter_expr_t *dat = (filter_expr_t*) usr; + filter_destroy(dat->filter); + free(dat->val); + free(dat); +} static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -1249,6 +1305,48 @@ static void _used_tags_add(convert_t *convert, int type, char *key) else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \ else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); } +// This invokes the functionality of -i/-e expressions +static char *set_filter_expr(convert_t *convert, char *key, int is_gtf) +{ + kstring_t str = {0,0,0}; + char *ptr = key; + while ( *ptr && *ptr!=')' ) ptr++; + if ( !*ptr ) error("Could not parse format string: %s\n",convert->format_str); + kputsn(key, ptr-key+1, &str); + register_tag(convert, str.s, is_gtf, T_FILTER_EXPR); + free(str.s); + return key+str.l; +} + +// These are the -i/-e functions made to be printed via `query -f` +#define _SET_FILTER_EXPR(convert,function,key,ptr,is_gtf) \ + if ( !strncasecmp(key,"MAX(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MIN(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MEAN(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MEDIAN(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"AVG(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SUM(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"ABS(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"COUNT(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"STDEV(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"STRLEN(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"BINOM(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"PHRED(",6) ) { ptr = function(convert,key,is_gtf); } 
\ + else if ( !strncasecmp(key,"SMPL_MAX(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MIN(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MEAN(",10) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MEDIAN(",12) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_AVG(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_STDEV(",11) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_SUM(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMAX(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMIN(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMEAN(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMEDIAN(",8) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sAVG(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sSTDEV(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sSUM(",5) ) { ptr = function(convert,key,is_gtf); } + static void set_type(fmt_t *fmt, int type) { fmt->type = type; } static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) { @@ -1273,8 +1371,8 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) ) { _SET_NON_FORMAT_TAGS(set_type,key,fmt) - else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } + else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } + else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } else if ( id>=0 && 
bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) @@ -1295,6 +1393,14 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) convert->max_unpack |= filter_max_unpack(flt); fmt->usr = (void*) flt; } + else if ( fmt->type==T_FILTER_EXPR ) + { + filter_t *filter = filter_init(convert->header,key); + convert->max_unpack |= filter_max_unpack(filter); + filter_expr_t *dat = calloc(1,sizeof(filter_expr_t)); + fmt->usr = dat; + dat->filter = filter; + } } switch (fmt->type) @@ -1332,6 +1438,7 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) case T_VKX: fmt->handler = &process_variantkey_hex; break; case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break; + case T_FILTER_EXPR: fmt->handler = &process_filter_expr; fmt->destroy = &destroy_filter_expr; break; default: error("TODO: handler for type %d\n", fmt->type); } if ( key && fmt->type==T_INFO ) @@ -1360,14 +1467,28 @@ static int parse_subscript(char **p) static char *parse_tag(convert_t *convert, char *p, int is_gtf) { + int is_vcf_column = p[1]=='/' ? 1 : 0; + if ( is_vcf_column ) p++; + char *q = ++p; while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; kstring_t str = {0,0,0}; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - if ( is_gtf ) + if ( is_gtf && is_vcf_column ) + { + _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) + else if ( !strcmp(str.s, "ALT") ) + { + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); + fmt->subscript = parse_subscript(&q); + } + else error("Could not parse tag: %s .. 
%s\n", str.s,convert->format_str); + } + else if ( is_gtf ) { - if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); + _SET_FILTER_EXPR(convert,set_filter_expr,p,q,1) + else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); else if ( !strcmp(str.s, "TBCSQ") ) @@ -1422,6 +1543,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else { _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) + else _SET_FILTER_EXPR(convert,set_filter_expr,p,q,0) else if ( !strcmp(str.s, "ALT") ) { fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index e7d2905c..0b23f371 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -30,6 +30,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -81,6 +82,7 @@ THE SOFTWARE. 
*/ #define T_VKX 31 // VARIANTKEY HEX #define T_PBINOM 32 #define T_NPASS 33 +#define T_FILTER_EXPR 34 // print the results of -i/-e functions via query typedef struct _fmt_t { @@ -125,6 +127,16 @@ typedef struct } bcsq_t; +typedef struct +{ + filter_t *filter; + int nval; + double *val; +} +filter_expr_t; + +static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type); + static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); } static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); } static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); } @@ -1159,6 +1171,50 @@ static void destroy_npass(void *usr) { filter_destroy((filter_t*)usr); } +static void process_filter_expr(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + filter_expr_t *dat = (filter_expr_t*) fmt->usr; + + int i, nval, nval1; + const double *val; + if ( fmt->is_gt_field ) + { + if ( !fmt->ready ) + { + filter_test(dat->filter,line,NULL); + val = filter_get_doubles(dat->filter,&nval,&nval1); + if ( fmt->is_gt_field ) + { + if ( !dat->nval ) + { + dat->nval = nval; + dat->val = malloc(nval*sizeof(double)); + if ( !dat->val ) error("Error: failed to allocate %zu bytes\n",nval*sizeof(double)); + } + assert( dat->nval==nval ); + for (i=0; ival[i] = val[i]; + } + fmt->ready = 1; + } + val = dat->val; + nval = dat->nval; + } + else + { + filter_test(dat->filter,line,NULL); + val = filter_get_doubles(dat->filter,&nval,&nval1); + } + if ( isample<0 ) isample = 0; + if ( isample>=nval ) isample = 0; + kputd(val[isample], str); +} +static void destroy_filter_expr(void *usr) +{ + filter_expr_t *dat = (filter_expr_t*) usr; + filter_destroy(dat->filter); + free(dat->val); + free(dat); +} static void process_pbinom(convert_t 
*convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -1251,6 +1307,48 @@ static void _used_tags_add(convert_t *convert, int type, char *key) else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \ else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); } +// This invokes the functionality of -i/-e expressions +static char *set_filter_expr(convert_t *convert, char *key, int is_gtf) +{ + kstring_t str = {0,0,0}; + char *ptr = key; + while ( *ptr && *ptr!=')' ) ptr++; + if ( !*ptr ) error("Could not parse format string: %s\n",convert->format_str); + kputsn(key, ptr-key+1, &str); + register_tag(convert, str.s, is_gtf, T_FILTER_EXPR); + free(str.s); + return key+str.l; +} + +// These are the -i/-e functions made to be printed via `query -f` +#define _SET_FILTER_EXPR(convert,function,key,ptr,is_gtf) \ + if ( !strncasecmp(key,"MAX(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MIN(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MEAN(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"MEDIAN(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"AVG(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SUM(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"ABS(",4) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"COUNT(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"STDEV(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"STRLEN(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"BINOM(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"PHRED(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MAX(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MIN(",9) ) { ptr = 
function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MEAN(",10) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_MEDIAN(",12) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_AVG(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_STDEV(",11) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"SMPL_SUM(",9) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMAX(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMIN(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMEAN(",6) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sMEDIAN(",8) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sAVG(",5) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sSTDEV(",7) ) { ptr = function(convert,key,is_gtf); } \ + else if ( !strncasecmp(key,"sSUM(",5) ) { ptr = function(convert,key,is_gtf); } + static void set_type(fmt_t *fmt, int type) { fmt->type = type; } static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) { @@ -1275,8 +1373,8 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) ) { _SET_NON_FORMAT_TAGS(set_type,key,fmt) - else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } + else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } + else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) @@ -1297,6 +1395,14 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) 
convert->max_unpack |= filter_max_unpack(flt); fmt->usr = (void*) flt; } + else if ( fmt->type==T_FILTER_EXPR ) + { + filter_t *filter = filter_init(convert->header,key); + convert->max_unpack |= filter_max_unpack(filter); + filter_expr_t *dat = calloc(1,sizeof(filter_expr_t)); + fmt->usr = dat; + dat->filter = filter; + } } switch (fmt->type) @@ -1334,6 +1440,7 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) case T_VKX: fmt->handler = &process_variantkey_hex; break; case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break; + case T_FILTER_EXPR: fmt->handler = &process_filter_expr; fmt->destroy = &destroy_filter_expr; break; default: error("TODO: handler for type %d\n", fmt->type); } if ( key && fmt->type==T_INFO ) @@ -1362,14 +1469,28 @@ static int parse_subscript(char **p) static char *parse_tag(convert_t *convert, char *p, int is_gtf) { + int is_vcf_column = p[1]=='/' ? 1 : 0; + if ( is_vcf_column ) p++; + char *q = ++p; while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; kstring_t str = {0,0,0}; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - if ( is_gtf ) + if ( is_gtf && is_vcf_column ) + { + _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) + else if ( !strcmp(str.s, "ALT") ) + { + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); + fmt->subscript = parse_subscript(&q); + } + else error("Could not parse tag: %s .. 
%s\n", str.s,convert->format_str); + } + else if ( is_gtf ) { - if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); + _SET_FILTER_EXPR(convert,set_filter_expr,p,q,1) + else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); else if ( !strcmp(str.s, "TBCSQ") ) @@ -1424,6 +1545,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else { _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) + else _SET_FILTER_EXPR(convert,set_filter_expr,p,q,0) else if ( !strcmp(str.s, "ALT") ) { fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); diff --git a/bcftools/csq.c b/bcftools/csq.c index b38eba10..53fa4daa 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2024 Genome Research Ltd. + Copyright (c) 2016-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -218,6 +218,10 @@ #define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING)) #define CSQ_PRN_BIOTYPE CSQ_NON_CODING +#define CHR_VCF 0 +#define CHR_GFF 1 +#define CHR_FAI 2 + // see kput_vcsq() const char *csq_strings[] = { @@ -367,15 +371,24 @@ typedef struct { int mstack; hstack_t *stack; - gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand - kstring_t sseq; // spliced haplotype sequence on ref strand - kstring_t tseq; // the variable part of translated haplotype transcript, coding strand - kstring_t tref; // the variable part of translated reference transcript, coding strand - uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS + gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand + kstring_t sseq; // spliced haplotype sequence on ref strand + kstring_t tseq; // the variable part of translated haplotype transcript, coding strand + kstring_t tref; // the variable part of translated reference transcript, coding strand + kstring_t tseq_stop; // the stop/start codons in tseq and tref + kstring_t tref_stop; // + uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS int upstream_stop; } hap_t; +typedef struct +{ + int id; + const char *name, *code, *stop; +} +gencode_t; + typedef struct _args_t { // the main regidx lookups, from chr:beg-end to overlapping features and @@ -413,14 +426,18 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; - int unify_chr_names; - char *chr_name; + char *unify_chr_names; // e.g. chr,Chromosome,-; prefixes in VCF,GFF,fasta + char *unify_chr_names_err; + char *chr_prefix[3]; // chr prefix to trim in VCF,GFF,fasta. 
See also CHR_VCF,CHR_GFF,CHR_FAI + char *chr_name, *chr_names[3]; int mchr_name; struct { int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; - int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ref_allele_mismatch; } warned; + char *gencode_str; // which genetic code table to use + gencode_t *gencode; // genetic code table int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing hap_t *hap; // transcript haplotype recursion @@ -440,8 +457,94 @@ typedef struct _args_t } args_t; +// Generated with misc/gencode-tables // AAA, AAC, ... -const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; +gencode_t gencode_tables[] = +{ + {.id=0, .name="Standard sipmlified", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-*-----*-------" }, + {.id=1, .name="Standard", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-----M-" }, + {.id=2, .name="Vertebrate Mitochondrial", + .code="KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------*-*-MMMM------------------------------M-*-*-------------" }, + {.id=3, .name="Yeast Mitochondrial", + .code="KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------M-M-------------------------------M-*-*-------------" }, + {.id=4, .name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------MMMM--------------M---------------M-*-*---------M-M-" }, + {.id=5, .name="Invertebrate Mitochondrial", + .code="KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + 
.stop="------------MMMM------------------------------M-*-*-----------M-" }, + {.id=6, .name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=9, .name="Echinoderm Mitochondrial; Flatworm Mitochondrial", + .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M-------------------------------M-*-*-------------" }, + {.id=10, .name="Euplotid Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF", + .stop="--------------M---------------------------------*-*-------------" }, + {.id=11, .name="Bacterial, Archaeal and Plant Plastid", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="------------MMMM--------------M---------------M-*-*-----*-----M-" }, + {.id=12, .name="Alternative Yeast Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-------" }, + {.id=13, .name="Ascidian Mitochondrial", + .code="KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------M-M-------------------------------M-*-*-----------M-" }, + {.id=14, .name="Alternative Flatworm Mitochondrial", + .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF", + .stop="--------------M-----------------------------------*-------------" }, + {.id=15, .name="Blepharisma Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-------*-------" }, + {.id=16, .name="Chlorophycean Mitochondrial", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-------*-------" }, + {.id=21, .name="Trematode Mitochondrial", + 
.code="NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M-------------------------------M-*-*-------------" }, + {.id=22, .name="Scenedesmus obliquus Mitochondrial Code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF", + .stop="--------------M---------------------------------*---*---*-------" }, + {.id=23, .name="Thraustochytrium mitochondrial code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF", + .stop="--------------MM------------------------------M-*-*-----*---*---" }, + {.id=24, .name="Pterobranchia Mitochondrial", + .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M---------------M---------------M-*-*-----------M-" }, + {.id=25, .name="Candidate Division SR1 and Gracilibacteria", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF", + .stop="--------------M-------------------------------M-*-*-----------M-" }, + {.id=26, .name="Pachysolen tannophilus Nuclear Code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLALEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-------" }, + {.id=27, .name="Karyorelict Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=28, .name="Condylostoma Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF", + .stop="--------------M---------------------------------*-*-----*-------" }, + {.id=29, .name="Mesodinium Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYYYYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=30, .name="Peritrich Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=31, 
.name="Blastocrithidia Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSSWCWCLFLF", + .stop="--------------M---------------------------------*-*-------------" }, + {.id=33, .name="Cephalodiscidae Mitochondrial UAA-Tyr", + .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF", + .stop="--------------M---------------M---------------M---*-----------M-" }, + {.id=-1, .name=NULL, .code=NULL, .stop=NULL} +}; +gencode_t *gencode = NULL; const uint8_t nt4[] = { 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, @@ -464,8 +567,10 @@ const uint8_t cnt4[] = 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, 4,4,4,4, 0 }; -#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] -#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] +#define dna2aa(x) gencode->code[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2aa(x) gencode->code[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] +#define dna2stop(x) gencode->stop[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2stop(x) gencode->stop[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] static inline int ncsq2_to_nfmt(int ncsq2) { @@ -477,6 +582,111 @@ static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit) *ibit = icsq2 % 30; } +static void init_gencode(args_t *args) +{ + int i,j,k; + if ( !args->gencode_str ) args->gencode_str = "0"; + if ( !strcasecmp("l",args->gencode_str) ) + { + printf("# The tables are ordered by codon as AAA,AAC,AAG,...,ACA,ACC,...,TTT:\n"); + printf("#\n#\t"); + for (i=0; i<4; i++) + { + for (j=0; j<16; j++) printf("%c","ACGT"[i]); + } + printf("\n#\t"); + for (i=0; i<4; i++) + { + for (j=0; j<4; j++) + { + for (k=0; k<4; k++) printf("%c","ACGT"[j]); + } + } + printf("\n#\t"); + for (i=0; i<16; i++) + { + for (k=0; k<4; k++) printf("%c","ACGT"[k]); 
+ } + printf("\n#\n\n"); + + for (i=0; gencode_tables[i].id >= 0; i++) + { + gencode_t *gc = &gencode_tables[i]; + printf("%d\t%s\n\t%s\n\t%s\n\n",gc->id,gc->name,gc->code,gc->stop); + } + exit(0); + } + char *tmp; + int id = strtol(args->gencode_str,&tmp,10); + if ( *tmp ) error("Could not parse argument: --genetic-code %s\n",args->gencode_str); + for (i=0; gencode_tables[i].id >= 0; i++) + { + gencode_t *gc = &gencode_tables[i]; + if ( gc->id==id ) + { + gencode = gc; + break; + } + } + if ( !gencode ) error("Could not parse argument: --genetic-code %s, no such table\n",args->gencode_str); +} +void test_prefix(kstring_t *str, const char *seq) +{ + if ( !strncasecmp(seq,"chromosome_",11) ) kputsn(seq,11,str); + else if ( !strncasecmp(seq,"chromosome",10) ) kputsn(seq,10,str); + else if ( !strncasecmp(seq,"chrom_",6) ) kputsn(seq,6,str); + else if ( !strncasecmp(seq,"chrom",5) ) kputsn(seq,5,str); + else if ( !strncasecmp(seq,"chr_",4) ) kputsn(seq,4,str); + else if ( !strncasecmp(seq,"chr",3) ) kputsn(seq,3,str); + else kputs("-",str); +} +void init_chr_names(args_t *args) +{ + // init chr prefixes to trim + int i,n; + char **tmp; + + // chr prefixes given explicitly + args->unify_chr_names_err = strdup("check if --unify-chr-names or --force could help"); + if ( args->unify_chr_names && (tmp=hts_readlist(args->unify_chr_names,0,&n)) ) + { + if ( n!=3 ) error("Error: expected three strings, got --unify-chr-names %s\n",args->unify_chr_names); + for (i=0; i<3; i++) + if ( strcmp("-",tmp[i]) ) args->chr_prefix[i] = tmp[i]; + else free(tmp[i]); + free(tmp); + return; + } + + int nseq; + const char **vcf = bcf_hdr_seqnames(args->hdr, &nseq); + if ( !vcf ) return; + const char *seq_vcf = vcf[0]; + const char *seq_gff = gff_iseq(args->gff,0); + const char *seq_fa = faidx_iseq(args->fai,0); + free(vcf); + if ( !strcmp(seq_vcf,seq_fa) && !strcmp(seq_vcf,seq_gff) ) return; + + // First sequences not identical: either they have different prefix or they are in different order. 
+ // See if we can suggest the --unify-chr-names parameter to use + kstring_t chr_vcf = {0,0,0}, chr_gff = {0,0,0}, chr_fa = {0,0,0}, str = {0,0,0}; + test_prefix(&chr_vcf, seq_vcf); + test_prefix(&chr_gff, seq_gff); + test_prefix(&chr_fa, seq_fa); + int same_chr = 1; + if ( strcmp(!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l,!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l) ) same_chr = 0; + if ( strcmp(!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l,!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l) ) same_chr = 0; + if ( strcmp(!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l,!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l) ) same_chr = 0; + free(args->unify_chr_names_err); + if ( same_chr ) + ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, try to run with --unify-chr-names %s,%s,%s\n",seq_vcf,seq_gff,seq_fa,chr_vcf.s,chr_gff.s,chr_fa.s); + else + ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, check if running with --unify-chr-names or --force coud help\n",seq_vcf,seq_gff,seq_fa); + free(chr_vcf.s); + free(chr_gff.s); + free(chr_fa.s); + args->unify_chr_names_err = str.s; +} void init_data(args_t *args) { args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max); @@ -486,7 +696,6 @@ void init_data(args_t *args) args->gff = gff_init(args->gff_fname); gff_set(args->gff,verbosity,args->verbosity); - gff_set(args->gff,strip_chr_names,args->unify_chr_names); gff_set(args->gff,force_out_of_phase,args->force); gff_set(args->gff,dump_fname,args->dump_gff); gff_parse(args->gff); @@ -496,6 +705,8 @@ void init_data(args_t *args) args->idx_tscript = gff_get(args->gff,idx_tscript); args->itr = regitr_init(NULL); + init_chr_names(args); + args->rid = -1; if ( args->filter_str ) @@ -579,6 +790,13 @@ void destroy_data(args_t *args) kh_destroy(pos2vbuf,args->pos2vbuf); if ( args->smpl ) smpl_ilist_destroy(args->smpl); int i,j,ret; + for (i=0; i<3; i++) + { + free(args->chr_prefix[i]); + free(args->chr_names[i]); + } + free(args->chr_name); + if ( 
args->out_fh ) { if ( args->write_index ) @@ -617,12 +835,14 @@ void destroy_data(args_t *args) free(args->hap->sseq.s); free(args->hap->tseq.s); free(args->hap->tref.s); + free(args->hap->tseq_stop.s); + free(args->hap->tref_stop.s); free(args->hap); fai_destroy(args->fai); free(args->gt_arr); free(args->str.s); free(args->str2.s); - free(args->chr_name); + free(args->unify_chr_names_err); } /* @@ -666,6 +886,7 @@ void splice_init(splice_t *splice, bcf1_t *rec) } static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) { + // beg .. the beggining of the splice region // len>0 .. beg is the first base, del filled from right // len<0 .. beg is the last base, del filled from left @@ -681,8 +902,24 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) } else { - rbeg = abeg = beg; - rlen = alen = len; + if ( beg < splice->tr->beg ) + { + // This can happen with very short exons and introns. Not a real biology, but the program + // should not crash on it. This is not a real fix, the code would need a revamp to handle + // well cases like this, see test/csq/ENSCAFT00000047742 + // >chr9:104-110 + // ATGTCAGGGCC + // ATGTC-GGGCC + // 456 + // eee.eee + rbeg = abeg = splice->tr->beg; + rlen = alen = 0; + } + else + { + rbeg = abeg = beg; + rlen = alen = len; + } // check for incomplete del as above?? 
} @@ -808,20 +1045,32 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type); csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } -static inline const char *drop_chr_prefix(args_t *args, const char *chr) +static inline const char *unify_chr_name(args_t *args, const char *chr, int isrc, int idst) { - if ( !args->unify_chr_names ) return chr; - if ( !strncasecmp("chr",chr,3) ) return chr+3; - return chr; -} -static inline const char *add_chr_prefix(args_t *args, const char *chr) -{ - if ( !args->unify_chr_names ) return chr; - int len = strlen(chr); - hts_expand(char,len+4,args->mchr_name,args->chr_name); - memcpy(args->chr_name,"chr",3); - memcpy(args->chr_name+3,chr,len+1); - return args->chr_name; + if ( !args->chr_prefix[isrc] && !args->chr_prefix[idst] ) return chr; + + int off = 0, len = strlen(chr); + if ( args->chr_prefix[isrc] ) + { + off = strlen(args->chr_prefix[isrc]); + len -= off; + if ( strncmp(args->chr_prefix[isrc],chr,off) ) + error("Error: failed to unify chr names, cannot strip \"%s\" from \"%s\"\n",args->chr_prefix[isrc],chr); + } + hts_expand(char,len+1,args->mchr_name,args->chr_name); + memcpy(args->chr_name,chr+off,len+1); + + if ( args->chr_prefix[idst] ) + { + off = strlen(args->chr_prefix[idst]); + hts_expand(char,len+off+1,args->mchr_name,args->chr_name); + memmove(args->chr_name+off,args->chr_name,len+1); + memcpy(args->chr_name,args->chr_prefix[idst],off); + } + + free(args->chr_names[idst]); + args->chr_names[idst] = strdup(args->chr_name); + return args->chr_names[idst]; } static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { @@ -848,7 +1097,7 @@ fprintf(stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -886,7 +1135,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1065,7 +1314,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1093,7 +1342,9 @@ fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% { if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; - if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + + int noff = N_SPLICE_REGION_INTRON - N_SPLICE_DONOR; + if ( ref && alt && noffkref.l && noffkalt.l && !strncmp(ref+noff,alt+noff,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } } } @@ -1121,7 +1372,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1212,7 +1463,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1242,7 +1493,7 @@ fprintf(stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1348,10 +1599,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; } else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; } } - if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M + if ( splice.check_start ) { - if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } - else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + // Do not check starts in incomplete CDS, defined as not starting with M + // Not this is not always true, there are alternative start codons + if ( tr->strand==STRAND_FWD ) { if ( dna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else if ( tr->strand==STRAND_REV ) { if ( cdna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } } if ( child->icds!=0 ) splice.check_region_beg = 1; if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; @@ -1365,7 +1618,7 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, csq=%d\n\n #endif if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA - if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq + if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq { free(splice.kref.s); free(splice.kalt.s); @@ -1493,7 +1746,7 @@ void hap_destroy(hap_node_t *hap) tseq: translated sequence (aa) fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev) */ -void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) +void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, kstring_t *tseq_stop, int fill) { #if XDBG fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); @@ -1505,9 +1758,11 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, kstring_t seq = *_seq; tseq->l = 0; + tseq_stop->l = 0; if ( !seq.l ) { kputc('?', tseq); + kputc('?', tseq_stop); return; } @@ -1541,6 +1796,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, if ( i==3 ) { kputc_(dna2aa(tmp), tseq); + kputc_(dna2stop(tmp), tseq_stop); #if DBG>1 fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif @@ -1549,6 +1805,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, while ( codon < end ) { kputc_(dna2aa(codon), tseq); + kputc_(dna2stop(codon), tseq_stop); #if DBG>1 fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); #endif @@ -1572,6 +1829,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, codon++; } kputc_(dna2aa(tmp), tseq); + kputc_(dna2stop(tmp), tseq_stop); #if DBG>1 fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif @@ -1582,6 +1840,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, while ( 
codon+3 <= end ) { kputc_(dna2aa(codon), tseq); + kputc_(dna2stop(codon), tseq_stop); #if DBG>1 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); #endif @@ -1626,10 +1885,12 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); #endif kputc_(cdna2aa(tmp), tseq); + kputc_(cdna2stop(tmp), tseq_stop); codon = end - 3; while ( codon >= seq.s ) { kputc_(cdna2aa(codon), tseq); + kputc_(cdna2stop(codon), tseq_stop); #if DBG>1 fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); #endif @@ -1659,6 +1920,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, { for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end); kputc_(cdna2aa(tmp), tseq); + kputc_(cdna2stop(tmp), tseq_stop); #if DBG>1 fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); #endif @@ -1669,6 +1931,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, while ( codon >= ref.s + N_REF_PAD ) { kputc_(cdna2aa(codon), tseq); + kputc_(cdna2stop(codon), tseq_stop); #if DBG>1 fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); #endif @@ -1678,6 +1941,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, } else error("Should not happen: %d\n", strand); kputc_(0,tseq); tseq->l--; + kputc_(0,tseq_stop); tseq_stop->l--; #if DBG fprintf(stderr," tseq: %s\n", tseq->s); #endif @@ -1771,6 +2035,9 @@ fprintf(stderr,"csq_push: %d .. 
%d\n",(int)rec->pos+1,csq->type.type); if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); + if ( vrec->vcsq[i].type&CSQ_START_RETAINED ) + vrec->vcsq[i].type &= ~(CSQ_START_LOST|CSQ_SYNONYMOUS_VARIANT); + if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; goto exit_duplicate; } @@ -1868,14 +2135,14 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) kputs(csq->vstr.s, str); } -void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) +void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *stop, kstring_t *str) { if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 ) kputs(aa->s, str); else { int i, len = aa->l; - if ( aa->s[len-1]=='*' ) len--; + if ( stop->s[len-1]=='*' ) len--; for (i=0; ibrief_predictions; i++) kputc(aa->s[i], str); kputs("..", str); kputw(beg+len, str); @@ -1909,33 +2176,37 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, if ( hap->stack[ibeg].node->type != HAP_SSS ) { // check for truncating stops - for (i=0; itref.l; i++) - if ( hap->tref.s[i]=='*' ) break; - if ( i!=hap->tref.l ) + for (i=0; itref_stop.l; i++) + if ( hap->tref_stop.s[i]=='*' ) break; + if ( i!=hap->tref_stop.l ) { hap->tref.l = i+1; hap->tref.s[i+1] = 0; + hap->tref_stop.l = i+1; + hap->tref_stop.s[i+1] = 0; } - for (i=0; itseq.l; i++) - if ( hap->tseq.s[i]=='*' ) break; + for (i=0; itseq_stop.l; i++) + if ( hap->tseq_stop.s[i]=='*' ) break; if ( i!=hap->tseq.l ) { hap->tseq.l = i+1; hap->tseq.s[i+1] = 0; + hap->tseq_stop.l = i+1; + hap->tseq_stop.s[i+1] = 0; hap->upstream_stop = 1; } if ( csq->type.type & CSQ_STOP_LOST ) { - if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + if ( hap->tref_stop.s[hap->tref_stop.l-1]=='*' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] ) { rm_csq |= CSQ_STOP_LOST; csq->type.type |= 
CSQ_STOP_RETAINED; } - else if ( hap->tref.s[hap->tref.l-1]!='*' ) + else if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' ) { // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf - if ( hap->tseq.s[hap->tseq.l-1] == '*' ) + if ( hap->tseq_stop.s[hap->tseq_stop.l-1] == '*' ) { rm_csq |= CSQ_STOP_GAINED; csq->type.type |= CSQ_STOP_RETAINED; @@ -1944,10 +2215,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INCOMPLETE_CDS; } } - if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' ) + if ( csq->type.type & CSQ_START_LOST ) { - rm_csq |= CSQ_START_LOST; - csq->type.type &= ~CSQ_START_LOST; + if ( hap->tref_stop.s[hap->tref_stop.l-1]=='M' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] ) + { + rm_csq |= CSQ_START_LOST; + csq->type.type |= CSQ_START_RETAINED; + } } if ( dlen!=0 ) { @@ -1957,7 +2231,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INFRAME_DELETION; else csq->type.type |= CSQ_INFRAME_INSERTION; - if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' ) + if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' ) csq->type.type |= CSQ_STOP_GAINED; } else @@ -1967,9 +2241,9 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, { if ( hap->tref.s[i] == hap->tseq.s[i] ) continue; aa_change = 1; - if ( hap->tref.s[i] == '*' ) + if ( hap->tref_stop.s[i] == '*' ) csq->type.type |= CSQ_STOP_LOST; - else if ( hap->tseq.s[i] == '*' ) + else if ( hap->tseq_stop.s[i] == '*' ) csq->type.type |= CSQ_STOP_GAINED; else csq->type.type |= CSQ_MISSENSE_VARIANT; @@ -1979,11 +2253,19 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } } // Check if compound inframe variants are real 
inframes, or if the stop codon occurs before the frameshift can be restored - if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' ) + if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' ) { rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING; csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED; } + if ( csq->type.type & CSQ_FRAMESHIFT_VARIANT && csq->type.type & CSQ_START_LOST ) + { + // this is to prevent + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+ + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G + rm_csq |= CSQ_FRAMESHIFT_VARIANT; + hap->stack[ibeg].node->type = HAP_SSS; + } if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; csq->type.type &= ~rm_csq; @@ -2004,12 +2286,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int aa_sbeg = tr->strand==STRAND_FWD ? 
node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); - kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); + kprint_aa_prediction(args,aa_rbeg,&hap->tref,&hap->tref_stop,&str); if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) { kputc_('>', &str); kputw(aa_sbeg, &str); - kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); + kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&hap->tseq_stop,&str); } kputc_('|', &str); @@ -2080,6 +2362,7 @@ void hap_finalize(args_t *args, hap_t *hap) hap->sseq.l = 0; hap->tseq.l = 0; + hap->tseq_stop.l = 0; hap->stack[0].node = TSCRIPT_AUX(tr)->root; hap->stack[0].ichild = -1; hap->stack[0].slen = 0; @@ -2167,13 +2450,13 @@ void hap_finalize(args_t *args, hap_t *hap) } else // splice site overlap, see #1475227917 sseq.l = fill = 0; - cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill); // ref sseq.l = node2rend(i) - rbeg; sseq.s = sref.s + N_REF_PAD + rbeg; sseq.m = sref.m - 2*N_REF_PAD; - cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill); sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel); @@ -2225,13 +2508,13 @@ void hap_finalize(args_t *args, hap_t *hap) } else // splice site overlap, see #1475227917 sseq.l = fill = 0; - cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill); // ref sseq.l = node2rend(ibeg) - rbeg; sseq.s = sref.s + N_REF_PAD + rbeg; sseq.m = sref.m - 2*N_REF_PAD; - cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill); sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; 
hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel); @@ -2489,13 +2772,7 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) int i, len; int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; - const char *tmp_chr = chr; - if ( !faidx_has_seq(args->fai,tmp_chr) ) - { - tmp_chr = drop_chr_prefix(args,chr); - if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr); - } - TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); if ( !TSCRIPT_AUX(tr)->ref ) error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); @@ -2513,7 +2790,8 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) } } -static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) +// returns 0 on success, negative number on reference mismatch +static int sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) { int vbeg = 0; int rbeg = rec->pos - tr->beg + N_REF_PAD; @@ -2525,23 +2803,40 @@ static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) while ( ref[i] && vcf[i] ) { if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) - error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", - bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + { + if ( !args->force ) + error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + + else if ( args->verbosity && (!args->warned.ref_allele_mismatch || args->verbosity > 1) ) + { + fprintf(stderr,"Warning: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + if ( args->verbosity < 2 ) + fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); + } + args->warned.ref_allele_mismatch++; + return -1; + } i++; } + return 0; } int test_cds_local(args_t *args, bcf1_t *rec) { int i,j, ret = 0; - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; // structures to fake the normal test_cds machinery hap_node_t root, node; root.type = HAP_ROOT; kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq; + kstring_t *tref_stop = &args->hap->tref_stop, *tseq_stop = &args->hap->tseq_stop; while ( regitr_overlap(args->itr) ) { @@ -2553,12 +2848,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) if ( !TSCRIPT_AUX(tr) ) { tr->aux = calloc(sizeof(tscript_t),1); - tscript_init_ref(args, tr, chr); + tscript_init_ref(args, tr, chr_fai); tscript_splice_ref(tr); khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards } - sanity_check_ref(args, tr, rec); + if ( sanity_check_ref(args, tr, rec)<0 ) continue; kstring_t sref; sref.s = TSCRIPT_AUX(tr)->sref; @@ -2594,40 +2889,44 @@ int test_cds_local(args_t *args, bcf1_t *rec) sseq.s = node.seq; int alen = sseq.l = strlen(sseq.s); int fill = node.dlen%3 && alen ? 
1 : 0; // see #1475227917 - cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill); + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, tseq_stop, fill); sseq.m = sref.m - 2*N_REF_PAD; sseq.s = sref.s + N_REF_PAD + node.sbeg; sseq.l = node.rlen; - cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill); + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, tref_stop, fill); // check for truncating stops - for (j=0; jl; j++) - if ( tref->s[j]=='*' ) break; - if ( j!=tref->l ) + for (j=0; jl; j++) + if ( tref_stop->s[j]=='*' ) break; + if ( j!=tref_stop->l ) { tref->l = j+1; tref->s[j+1] = 0; + tref_stop->l = j+1; + tref_stop->s[j+1] = 0; } - for (j=0; jl; j++) - if ( tseq->s[j]=='*' ) break; + for (j=0; jl; j++) + if ( tseq_stop->s[j]=='*' ) break; if ( j!=tseq->l ) { tseq->l = j+1; tseq->s[j+1] = 0; + tseq_stop->l = j+1; + tseq_stop->s[j+1] = 0; } if ( csq_type & CSQ_STOP_LOST ) { - if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + if ( tref_stop->s[tref_stop->l-1]=='*' && tref_stop->s[tref_stop->l-1] == tseq_stop->s[tseq_stop->l-1] ) { csq_type &= ~CSQ_STOP_LOST; csq_type |= CSQ_STOP_RETAINED; } - else if (tref->s[tref->l-1]!='*' ) + else if (tref_stop->s[tref_stop->l-1]!='*' ) { // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf - if ( tseq->s[tseq->l-1] == '*' ) + if ( tseq_stop->s[tseq_stop->l-1] == '*' ) { csq_type &= ~CSQ_STOP_GAINED; csq_type |= CSQ_STOP_RETAINED; @@ -2636,7 +2935,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INCOMPLETE_CDS; } } - if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' ) + if ( csq_type & CSQ_START_LOST && tref_stop->s[0]!='M' ) csq_type &= ~CSQ_START_LOST; if ( node.dlen!=0 ) { @@ -2646,8 +2945,20 @@ int 
test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INFRAME_DELETION; else csq_type |= CSQ_INFRAME_INSERTION; - if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' ) + if ( tref_stop->s[tref_stop->l-1]!='*' && tseq_stop->s[tseq_stop->l-1]=='*' ) csq_type |= CSQ_STOP_GAINED; + if ( csq_type & CSQ_START_LOST && csq_type & CSQ_FRAMESHIFT_VARIANT ) + { + // this is to prevent + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+ + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G + csq_type &= ~CSQ_FRAMESHIFT_VARIANT; + node.type = HAP_SSS; + csq_stage(args, &csq, rec); + free(node.seq); + free(node.var); + continue; + } } else { @@ -2656,9 +2967,9 @@ int test_cds_local(args_t *args, bcf1_t *rec) { if ( tref->s[j] == tseq->s[j] ) continue; aa_change = 1; - if ( tref->s[j] == '*' ) + if ( tref_stop->s[j] == '*' ) csq_type |= CSQ_STOP_LOST; - else if ( tseq->s[j] == '*' ) + else if ( tseq_stop->s[j] == '*' ) csq_type |= CSQ_STOP_GAINED; else csq_type |= CSQ_MISSENSE_VARIANT; @@ -2674,12 +2985,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) int aa_sbeg = tr->strand==STRAND_FWD ? 
node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); - kprint_aa_prediction(args,aa_rbeg,tref,&str); + kprint_aa_prediction(args,aa_rbeg,tref,tref_stop,&str); if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) { kputc_('>', &str); kputw(aa_sbeg, &str); - kprint_aa_prediction(args,aa_sbeg,tseq,&str); + kprint_aa_prediction(args,aa_sbeg,tseq,tseq_stop,&str); } kputc_('|', &str); kputw(rec->pos+1, &str); @@ -2715,9 +3026,11 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) static int overlaps_warned = 0, multiploid_warned = 0; int i, ret = 0, hap_ret; - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); @@ -2729,7 +3042,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { // initialize the transcript and its haplotype tree, fetch the reference sequence tr->aux = calloc(sizeof(tscript_t),1); - tscript_init_ref(args, tr, chr); + tscript_init_ref(args, tr, chr_fai); TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 
1 : 2*args->smpl->n; // maximum ploidy = diploid @@ -2741,7 +3054,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) khp_insert(trhp, args->active_tr, &tr); } - sanity_check_ref(args, tr, rec); + if ( sanity_check_ref(args, tr, rec)<0 ) continue; if ( args->phase==PHASE_DROP_GT ) { @@ -2758,13 +3071,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(stderr, "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", - chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( !overlaps_warned ) - fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); overlaps_warned = 1; } if ( args->out ) - fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); } else ret = 1; // prevent reporting as intron in test_tscript hap_destroy(child); @@ -2805,13 +3118,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(stderr, "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", - chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( !multiploid_warned ) - fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); multiploid_warned = 1; } if ( args->out ) - fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) 
rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); continue; } for (ismpl=0; ismplsmpl->n; ismpl++) @@ -2828,7 +3141,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) { if ( args->phase==PHASE_REQUIRE ) - error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); if ( args->phase==PHASE_SKIP ) continue; if ( args->phase==PHASE_NON_REF ) @@ -2871,14 +3184,14 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(stderr, "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", - chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); if ( !overlaps_warned ) - fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); overlaps_warned = 1; } if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", - chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); } hap_destroy(child); continue; @@ -2990,9 +3303,10 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } int test_utr(args_t *args, 
bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_utr,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3028,8 +3342,9 @@ int test_utr(args_t *args, bcf1_t *rec) } int test_splice(args_t *args, bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); - if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !regidx_overlap(args->idx_exon,chr_gff,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3060,8 +3375,9 @@ int test_splice(args_t *args, bcf1_t *rec) } int test_tscript(args_t *args, bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); - if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !regidx_overlap(args->idx_tscript,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3103,7 +3419,8 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) warned = 1; } - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); // only insertions atm int beg = rec->pos + 1; @@ -3111,7 +3428,7 @@ void 
test_symbolic_alt(args_t *args, bcf1_t *rec) int csq_class = CSQ_ELONGATION; int hit = 0; - if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_cds,chr_gff,beg,end, args->itr) ) { while ( regitr_overlap(args->itr) ) { @@ -3129,7 +3446,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) hit = 1; } } - if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_utr,chr_gff,beg,end, args->itr) ) { while ( regitr_overlap(args->itr) ) { @@ -3147,7 +3464,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) hit = 1; } } - if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_exon,chr_gff,beg,end, args->itr) ) { splice_t splice; splice_init(&splice, rec); @@ -3166,7 +3483,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) if ( splice.csq ) hit = 1; } } - if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) + if ( !hit && regidx_overlap(args->idx_tscript,chr_gff,beg,end, args->itr) ) { splice_t splice; splice_init(&splice, rec); @@ -3227,6 +3544,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) bcf1_t *rec = *rec_ptr; static int32_t prev_rid = -1, prev_pos = -1; + const char *chr_vcf = bcf_seqname(args->hdr,rec); if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; @@ -3235,14 +3553,28 @@ static void process(args_t *args, bcf1_t **rec_ptr) // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX). 
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the // reference file - if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); + if ( !faidx_has_seq(args->fai,chr_fai) ) { - if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) ) - error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + static int missing_chr_fai_warned = 0; + if ( !args->force ) + error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_fai,args->fa_fname,args->unify_chr_names_err); + else if ( !missing_chr_fai_warned++ ) + fprintf(stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_fai,args->fa_fname); + } + + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !gff_has_seq(args->gff,chr_gff) ) + { + static int missing_chr_gff_warned = 0; + if ( !args->force ) + error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_gff,args->gff_fname,args->unify_chr_names_err); + else if ( !missing_chr_gff_warned++ ) + fprintf(stderr,"Warning: the chromosome \"%s\" is not present in %s. 
This warning is printed only once.\n",chr_gff,args->gff_fname); } } if ( prev_pos > rec->pos ) - error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",chr_vcf,prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); int call_csq = 1; if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele @@ -3305,6 +3637,7 @@ static const char *usage(void) "\n" "CSQ options:\n" " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -C, --genetic-code INT|l Specify the genetic code table to use, 'l' to print a list [0]\n" " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" " -l, --local-csq Localized predictions, consider only one VCF record at a time\n" " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" @@ -3317,7 +3650,8 @@ static const char *usage(void) "GFF options:\n" " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n" " --force Run even if some sanity checks fail\n" - " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. 
chrX vs X) in GFF, fasta, and VCF [1]\n" + " --unify-chr-names 0|LIST Unify chromosome naming by stripping a prefix in VCF,GFF,fasta, respectively [0]\n" + " (e.g., \"chr,Chr,-\" trims \"chr\" in VCF and \"Chr\" in GFF, fasta is unchanged)\n" "General options:\n" " -e, --exclude EXPR Exclude sites for which the expression is true\n" " -i, --include EXPR Select sites for which the expression is true\n" @@ -3334,7 +3668,7 @@ static const char *usage(void) " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" - " -v, --verbose INT Verbosity level 0-2 [1]\n" + " -v, --verbosity INT Verbosity level 0-6 [1]\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Example:\n" @@ -3356,11 +3690,11 @@ int main_csq(int argc, char *argv[]) args->verbosity = 1; args->record_cmd_line = 1; args->clevel = -1; - args->unify_chr_names = 1; static struct option loptions[] = { {"force",0,0,1}, + {"genetic-code",required_argument,NULL,'C'}, {"threads",required_argument,NULL,2}, {"help",0,0,'h'}, {"ncsq",1,0,'n'}, @@ -3377,6 +3711,7 @@ int main_csq(int argc, char *argv[]) {"phase",1,0,'p'}, {"quiet",0,0,'q'}, {"verbose",1,0,'v'}, + {"verbosity",1,0,'v'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"regions-overlap",required_argument,NULL,4}, @@ -3395,7 +3730,7 @@ int main_csq(int argc, char *argv[]) int regions_overlap = 1; int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; - while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:C:ln:bB:v:W::",loptions,NULL)) >= 0) { switch (c) { @@ -3414,11 +3749,13 @@ int main_csq(int argc, char *argv[]) if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: 
--trim-protein-seq %s\n", optarg); break; case 'l': args->local_csq = 1; break; + case 'C': args->gencode_str = optarg; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; case 'v': args->verbosity = atoi(optarg); - if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); + if ( args->verbosity<0 ) error("Error: expected integer with -v, --verbosity\n"); + if ( args->verbosity > 3 ) hts_verbose = args->verbosity; break; case 'p': switch (optarg[0]) @@ -3482,16 +3819,14 @@ int main_csq(int argc, char *argv[]) error("Unsupported index format '%s'\n", optarg); break; case 7 : args->dump_gff = optarg; break; - case 8 : - if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; - else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; - else error("Could not parse: --unify-chr-names %s\n",optarg); - break; + case 8 : args->unify_chr_names = optarg; break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; } } + init_gencode(args); + char *fname = NULL; if ( optind==argc ) { diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index 3f482fdf..dfe1e890 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2016-2024 Genome Research Ltd. + Copyright (c) 2016-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -220,6 +220,10 @@ #define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING)) #define CSQ_PRN_BIOTYPE CSQ_NON_CODING +#define CHR_VCF 0 +#define CHR_GFF 1 +#define CHR_FAI 2 + // see kput_vcsq() const char *csq_strings[] = { @@ -369,15 +373,24 @@ typedef struct { int mstack; hstack_t *stack; - gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand - kstring_t sseq; // spliced haplotype sequence on ref strand - kstring_t tseq; // the variable part of translated haplotype transcript, coding strand - kstring_t tref; // the variable part of translated reference transcript, coding strand - uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS + gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand + kstring_t sseq; // spliced haplotype sequence on ref strand + kstring_t tseq; // the variable part of translated haplotype transcript, coding strand + kstring_t tref; // the variable part of translated reference transcript, coding strand + kstring_t tseq_stop; // the stop/start codons in tseq and tref + kstring_t tref_stop; // + uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS int upstream_stop; } hap_t; +typedef struct +{ + int id; + const char *name, *code, *stop; +} +gencode_t; + typedef struct _args_t { // the main regidx lookups, from chr:beg-end to overlapping features and @@ -415,14 +428,18 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; - int unify_chr_names; - char *chr_name; + char *unify_chr_names; // e.g. chr,Chromosome,-; prefixes in VCF,GFF,fasta + char *unify_chr_names_err; + char *chr_prefix[3]; // chr prefix to trim in VCF,GFF,fasta. 
See also CHR_VCF,CHR_GFF,CHR_FAI + char *chr_name, *chr_names[3]; int mchr_name; struct { int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; - int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ref_allele_mismatch; } warned; + char *gencode_str; // which genetic code table to use + gencode_t *gencode; // genetic code table int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing hap_t *hap; // transcript haplotype recursion @@ -442,8 +459,94 @@ typedef struct _args_t } args_t; +// Generated with misc/gencode-tables // AAA, AAC, ... -const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; +gencode_t gencode_tables[] = +{ + {.id=0, .name="Standard sipmlified", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-*-----*-------" }, + {.id=1, .name="Standard", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-----M-" }, + {.id=2, .name="Vertebrate Mitochondrial", + .code="KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------*-*-MMMM------------------------------M-*-*-------------" }, + {.id=3, .name="Yeast Mitochondrial", + .code="KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------M-M-------------------------------M-*-*-------------" }, + {.id=4, .name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------MMMM--------------M---------------M-*-*---------M-M-" }, + {.id=5, .name="Invertebrate Mitochondrial", + .code="KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + 
.stop="------------MMMM------------------------------M-*-*-----------M-" }, + {.id=6, .name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=9, .name="Echinoderm Mitochondrial; Flatworm Mitochondrial", + .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M-------------------------------M-*-*-------------" }, + {.id=10, .name="Euplotid Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF", + .stop="--------------M---------------------------------*-*-------------" }, + {.id=11, .name="Bacterial, Archaeal and Plant Plastid", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="------------MMMM--------------M---------------M-*-*-----*-----M-" }, + {.id=12, .name="Alternative Yeast Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-------" }, + {.id=13, .name="Ascidian Mitochondrial", + .code="KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="------------M-M-------------------------------M-*-*-----------M-" }, + {.id=14, .name="Alternative Flatworm Mitochondrial", + .code="NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF", + .stop="--------------M-----------------------------------*-------------" }, + {.id=15, .name="Blepharisma Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-------*-------" }, + {.id=16, .name="Chlorophycean Mitochondrial", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF", + .stop="--------------M---------------------------------*-------*-------" }, + {.id=21, .name="Trematode Mitochondrial", + 
.code="NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M-------------------------------M-*-*-------------" }, + {.id=22, .name="Scenedesmus obliquus Mitochondrial Code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF", + .stop="--------------M---------------------------------*---*---*-------" }, + {.id=23, .name="Thraustochytrium mitochondrial code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF", + .stop="--------------MM------------------------------M-*-*-----*---*---" }, + {.id=24, .name="Pterobranchia Mitochondrial", + .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF", + .stop="--------------M---------------M---------------M-*-*-----------M-" }, + {.id=25, .name="Candidate Division SR1 and Gracilibacteria", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF", + .stop="--------------M-------------------------------M-*-*-----------M-" }, + {.id=26, .name="Pachysolen tannophilus Nuclear Code", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLALEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF", + .stop="--------------M---------------M-----------------*-*-----*-------" }, + {.id=27, .name="Karyorelict Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=28, .name="Condylostoma Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSSWCWCLFLF", + .stop="--------------M---------------------------------*-*-----*-------" }, + {.id=29, .name="Mesodinium Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYYYYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=30, .name="Peritrich Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSS*CWCLFLF", + .stop="--------------M-----------------------------------------*-------" }, + {.id=31, 
.name="Blastocrithidia Nuclear", + .code="KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVEYEYSSSSWCWCLFLF", + .stop="--------------M---------------------------------*-*-------------" }, + {.id=33, .name="Cephalodiscidae Mitochondrial UAA-Tyr", + .code="KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF", + .stop="--------------M---------------M---------------M---*-----------M-" }, + {.id=-1, .name=NULL, .code=NULL, .stop=NULL} +}; +gencode_t *gencode = NULL; const uint8_t nt4[] = { 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, @@ -466,8 +569,10 @@ const uint8_t cnt4[] = 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4, 4,4,4,4, 0 }; -#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] -#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] +#define dna2aa(x) gencode->code[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2aa(x) gencode->code[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] +#define dna2stop(x) gencode->stop[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] +#define cdna2stop(x) gencode->stop[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] static inline int ncsq2_to_nfmt(int ncsq2) { @@ -479,6 +584,111 @@ static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit) *ibit = icsq2 % 30; } +static void init_gencode(args_t *args) +{ + int i,j,k; + if ( !args->gencode_str ) args->gencode_str = "0"; + if ( !strcasecmp("l",args->gencode_str) ) + { + fprintf(bcftools_stdout, "# The tables are ordered by codon as AAA,AAC,AAG,...,ACA,ACC,...,TTT:\n"); + fprintf(bcftools_stdout, "#\n#\t"); + for (i=0; i<4; i++) + { + for (j=0; j<16; j++) fprintf(bcftools_stdout, "%c","ACGT"[i]); + } + fprintf(bcftools_stdout, "\n#\t"); + for (i=0; i<4; i++) + { + for (j=0; j<4; j++) + { + for (k=0; k<4; k++) fprintf(bcftools_stdout, "%c","ACGT"[j]); + } + } + 
fprintf(bcftools_stdout, "\n#\t"); + for (i=0; i<16; i++) + { + for (k=0; k<4; k++) fprintf(bcftools_stdout, "%c","ACGT"[k]); + } + fprintf(bcftools_stdout, "\n#\n\n"); + + for (i=0; gencode_tables[i].id >= 0; i++) + { + gencode_t *gc = &gencode_tables[i]; + fprintf(bcftools_stdout, "%d\t%s\n\t%s\n\t%s\n\n",gc->id,gc->name,gc->code,gc->stop); + } + bcftools_exit(0); + } + char *tmp; + int id = strtol(args->gencode_str,&tmp,10); + if ( *tmp ) error("Could not parse argument: --genetic-code %s\n",args->gencode_str); + for (i=0; gencode_tables[i].id >= 0; i++) + { + gencode_t *gc = &gencode_tables[i]; + if ( gc->id==id ) + { + gencode = gc; + break; + } + } + if ( !gencode ) error("Could not parse argument: --genetic-code %s, no such table\n",args->gencode_str); +} +void test_prefix(kstring_t *str, const char *seq) +{ + if ( !strncasecmp(seq,"chromosome_",11) ) kputsn(seq,11,str); + else if ( !strncasecmp(seq,"chromosome",10) ) kputsn(seq,10,str); + else if ( !strncasecmp(seq,"chrom_",6) ) kputsn(seq,6,str); + else if ( !strncasecmp(seq,"chrom",5) ) kputsn(seq,5,str); + else if ( !strncasecmp(seq,"chr_",4) ) kputsn(seq,4,str); + else if ( !strncasecmp(seq,"chr",3) ) kputsn(seq,3,str); + else kputs("-",str); +} +void init_chr_names(args_t *args) +{ + // init chr prefixes to trim + int i,n; + char **tmp; + + // chr prefixes given explicitly + args->unify_chr_names_err = strdup("check if --unify-chr-names or --force could help"); + if ( args->unify_chr_names && (tmp=hts_readlist(args->unify_chr_names,0,&n)) ) + { + if ( n!=3 ) error("Error: expected three strings, got --unify-chr-names %s\n",args->unify_chr_names); + for (i=0; i<3; i++) + if ( strcmp("-",tmp[i]) ) args->chr_prefix[i] = tmp[i]; + else free(tmp[i]); + free(tmp); + return; + } + + int nseq; + const char **vcf = bcf_hdr_seqnames(args->hdr, &nseq); + if ( !vcf ) return; + const char *seq_vcf = vcf[0]; + const char *seq_gff = gff_iseq(args->gff,0); + const char *seq_fa = faidx_iseq(args->fai,0); + free(vcf); + 
if ( !strcmp(seq_vcf,seq_fa) && !strcmp(seq_vcf,seq_gff) ) return; + + // First sequences not identical: either they have different prefix or they are in different order. + // See if we can suggest the --unify-chr-names parameter to use + kstring_t chr_vcf = {0,0,0}, chr_gff = {0,0,0}, chr_fa = {0,0,0}, str = {0,0,0}; + test_prefix(&chr_vcf, seq_vcf); + test_prefix(&chr_gff, seq_gff); + test_prefix(&chr_fa, seq_fa); + int same_chr = 1; + if ( strcmp(!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l,!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l) ) same_chr = 0; + if ( strcmp(!strcmp("-",chr_gff.s)?seq_gff:seq_gff+chr_gff.l,!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l) ) same_chr = 0; + if ( strcmp(!strcmp("-",chr_fa.s)?seq_fa:seq_fa+chr_fa.l,!strcmp("-",chr_vcf.s)?seq_vcf:seq_vcf+chr_vcf.l) ) same_chr = 0; + free(args->unify_chr_names_err); + if ( same_chr ) + ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, try to run with --unify-chr-names %s,%s,%s\n",seq_vcf,seq_gff,seq_fa,chr_vcf.s,chr_gff.s,chr_fa.s); + else + ksprintf(&str,"the first sequence name in VCF/GFF/fasta is %s/%s/%s, check if running with --unify-chr-names or --force coud help\n",seq_vcf,seq_gff,seq_fa); + free(chr_vcf.s); + free(chr_gff.s); + free(chr_fa.s); + args->unify_chr_names_err = str.s; +} void init_data(args_t *args) { args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max); @@ -488,7 +698,6 @@ void init_data(args_t *args) args->gff = gff_init(args->gff_fname); gff_set(args->gff,verbosity,args->verbosity); - gff_set(args->gff,strip_chr_names,args->unify_chr_names); gff_set(args->gff,force_out_of_phase,args->force); gff_set(args->gff,dump_fname,args->dump_gff); gff_parse(args->gff); @@ -498,6 +707,8 @@ void init_data(args_t *args) args->idx_tscript = gff_get(args->gff,idx_tscript); args->itr = regitr_init(NULL); + init_chr_names(args); + args->rid = -1; if ( args->filter_str ) @@ -581,6 +792,13 @@ void destroy_data(args_t *args) kh_destroy(pos2vbuf,args->pos2vbuf); if ( 
args->smpl ) smpl_ilist_destroy(args->smpl); int i,j,ret; + for (i=0; i<3; i++) + { + free(args->chr_prefix[i]); + free(args->chr_names[i]); + } + free(args->chr_name); + if ( args->out_fh ) { if ( args->write_index ) @@ -619,12 +837,14 @@ void destroy_data(args_t *args) free(args->hap->sseq.s); free(args->hap->tseq.s); free(args->hap->tref.s); + free(args->hap->tseq_stop.s); + free(args->hap->tref_stop.s); free(args->hap); fai_destroy(args->fai); free(args->gt_arr); free(args->str.s); free(args->str2.s); - free(args->chr_name); + free(args->unify_chr_names_err); } /* @@ -668,6 +888,7 @@ void splice_init(splice_t *splice, bcf1_t *rec) } static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) { + // beg .. the beggining of the splice region // len>0 .. beg is the first base, del filled from right // len<0 .. beg is the last base, del filled from left @@ -683,8 +904,24 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) } else { - rbeg = abeg = beg; - rlen = alen = len; + if ( beg < splice->tr->beg ) + { + // This can happen with very short exons and introns. Not a real biology, but the program + // should not crash on it. This is not a real fix, the code would need a revamp to handle + // well cases like this, see test/csq/ENSCAFT00000047742 + // >chr9:104-110 + // ATGTCAGGGCC + // ATGTC-GGGCC + // 456 + // eee.eee + rbeg = abeg = splice->tr->beg; + rlen = alen = 0; + } + else + { + rbeg = abeg = beg; + rlen = alen = len; + } // check for incomplete del as above?? 
} @@ -810,20 +1047,32 @@ fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type); csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } -static inline const char *drop_chr_prefix(args_t *args, const char *chr) +static inline const char *unify_chr_name(args_t *args, const char *chr, int isrc, int idst) { - if ( !args->unify_chr_names ) return chr; - if ( !strncasecmp("chr",chr,3) ) return chr+3; - return chr; -} -static inline const char *add_chr_prefix(args_t *args, const char *chr) -{ - if ( !args->unify_chr_names ) return chr; - int len = strlen(chr); - hts_expand(char,len+4,args->mchr_name,args->chr_name); - memcpy(args->chr_name,"chr",3); - memcpy(args->chr_name+3,chr,len+1); - return args->chr_name; + if ( !args->chr_prefix[isrc] && !args->chr_prefix[idst] ) return chr; + + int off = 0, len = strlen(chr); + if ( args->chr_prefix[isrc] ) + { + off = strlen(args->chr_prefix[isrc]); + len -= off; + if ( strncmp(args->chr_prefix[isrc],chr,off) ) + error("Error: failed to unify chr names, cannot strip \"%s\" from \"%s\"\n",args->chr_prefix[isrc],chr); + } + hts_expand(char,len+1,args->mchr_name,args->chr_name); + memcpy(args->chr_name,chr+off,len+1); + + if ( args->chr_prefix[idst] ) + { + off = strlen(args->chr_prefix[idst]); + hts_expand(char,len+off+1,args->mchr_name,args->chr_name); + memmove(args->chr_name+off,args->chr_name,len+1); + memcpy(args->chr_name,args->chr_prefix[idst],off); + } + + free(args->chr_names[idst]); + args->chr_names[idst] = strdup(args->chr_name); + return args->chr_names[idst]; } static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { @@ -850,7 +1099,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -888,7 +1137,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1067,7 +1316,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1095,7 +1344,9 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg, { if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; - if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; + + int noff = N_SPLICE_REGION_INTRON - N_SPLICE_DONOR; + if ( ref && alt && noffkref.l && noffkalt.l && !strncmp(ref+noff,alt+noff,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } } } @@ -1123,7 +1374,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1214,7 +1465,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1244,7 +1495,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); + const char *chr = unify_chr_name(args, bcf_seqname(args->hdr,splice->vcf.rec),CHR_VCF,CHR_GFF); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -1350,10 +1601,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; } else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; } } - if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M + if ( splice.check_start ) { - if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } - else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + // Do not check starts in incomplete CDS, defined as not starting with M + // Note this is not always true, there are alternative start codons + if ( tr->strand==STRAND_FWD ) { if ( dna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else if ( tr->strand==STRAND_REV ) { if ( cdna2stop(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } } if ( child->icds!=0 ) splice.check_region_beg = 1; if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; @@ -1367,7 +1620,7 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, c #endif if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA - if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq + if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq { free(splice.kref.s); free(splice.kalt.s); @@ -1495,7 +1748,7 @@ void hap_destroy(hap_node_t *hap) tseq: translated sequence (aa) fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev) */ -void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) +void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, kstring_t *tseq_stop, int fill) { #if XDBG fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); @@ -1507,9 +1760,11 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r kstring_t seq = *_seq; tseq->l = 0; + tseq_stop->l = 0; if ( !seq.l ) { kputc('?', tseq); + kputc('?', tseq_stop); return; } @@ -1543,6 +1798,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r if ( i==3 ) { kputc_(dna2aa(tmp), tseq); + kputc_(dna2stop(tmp), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif @@ -1551,6 +1807,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r while ( codon < end ) { kputc_(dna2aa(codon), tseq); + kputc_(dna2stop(codon), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); #endif @@ -1574,6 +1831,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r codon++; } kputc_(dna2aa(tmp), tseq); + kputc_(dna2stop(tmp), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif @@ -1584,6 +1842,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d 
seq.l=%d\n",sbeg,rbeg,r while ( codon+3 <= end ) { kputc_(dna2aa(codon), tseq); + kputc_(dna2stop(codon), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); #endif @@ -1628,10 +1887,12 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r fprintf(bcftools_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); #endif kputc_(cdna2aa(tmp), tseq); + kputc_(cdna2stop(tmp), tseq_stop); codon = end - 3; while ( codon >= seq.s ) { kputc_(cdna2aa(codon), tseq); + kputc_(cdna2stop(codon), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); #endif @@ -1661,6 +1922,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r { for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end); kputc_(cdna2aa(tmp), tseq); + kputc_(cdna2stop(tmp), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); #endif @@ -1671,6 +1933,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r while ( codon >= ref.s + N_REF_PAD ) { kputc_(cdna2aa(codon), tseq); + kputc_(cdna2stop(codon), tseq_stop); #if DBG>1 fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); #endif @@ -1680,6 +1943,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r } else error("Should not happen: %d\n", strand); kputc_(0,tseq); tseq->l--; + kputc_(0,tseq_stop); tseq_stop->l--; #if DBG fprintf(bcftools_stderr," tseq: %s\n", tseq->s); #endif @@ -1773,6 +2037,9 @@ fprintf(bcftools_stderr,"csq_push: %d .. 
%d\n",(int)rec->pos+1,csq->type.type); if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); + if ( vrec->vcsq[i].type&CSQ_START_RETAINED ) + vrec->vcsq[i].type &= ~(CSQ_START_LOST|CSQ_SYNONYMOUS_VARIANT); + if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; goto exit_duplicate; } @@ -1870,14 +2137,14 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) kputs(csq->vstr.s, str); } -void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) +void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *stop, kstring_t *str) { if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 ) kputs(aa->s, str); else { int i, len = aa->l; - if ( aa->s[len-1]=='*' ) len--; + if ( stop->s[len-1]=='*' ) len--; for (i=0; ibrief_predictions; i++) kputc(aa->s[i], str); kputs("..", str); kputw(beg+len, str); @@ -1911,33 +2178,37 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, if ( hap->stack[ibeg].node->type != HAP_SSS ) { // check for truncating stops - for (i=0; itref.l; i++) - if ( hap->tref.s[i]=='*' ) break; - if ( i!=hap->tref.l ) + for (i=0; itref_stop.l; i++) + if ( hap->tref_stop.s[i]=='*' ) break; + if ( i!=hap->tref_stop.l ) { hap->tref.l = i+1; hap->tref.s[i+1] = 0; + hap->tref_stop.l = i+1; + hap->tref_stop.s[i+1] = 0; } - for (i=0; itseq.l; i++) - if ( hap->tseq.s[i]=='*' ) break; + for (i=0; itseq_stop.l; i++) + if ( hap->tseq_stop.s[i]=='*' ) break; if ( i!=hap->tseq.l ) { hap->tseq.l = i+1; hap->tseq.s[i+1] = 0; + hap->tseq_stop.l = i+1; + hap->tseq_stop.s[i+1] = 0; hap->upstream_stop = 1; } if ( csq->type.type & CSQ_STOP_LOST ) { - if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + if ( hap->tref_stop.s[hap->tref_stop.l-1]=='*' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] ) { rm_csq |= CSQ_STOP_LOST; csq->type.type |= 
CSQ_STOP_RETAINED; } - else if ( hap->tref.s[hap->tref.l-1]!='*' ) + else if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' ) { // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf - if ( hap->tseq.s[hap->tseq.l-1] == '*' ) + if ( hap->tseq_stop.s[hap->tseq_stop.l-1] == '*' ) { rm_csq |= CSQ_STOP_GAINED; csq->type.type |= CSQ_STOP_RETAINED; @@ -1946,10 +2217,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INCOMPLETE_CDS; } } - if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' ) + if ( csq->type.type & CSQ_START_LOST ) { - rm_csq |= CSQ_START_LOST; - csq->type.type &= ~CSQ_START_LOST; + if ( hap->tref_stop.s[hap->tref_stop.l-1]=='M' && hap->tref_stop.s[hap->tref_stop.l-1] == hap->tseq_stop.s[hap->tseq_stop.l-1] ) + { + rm_csq |= CSQ_START_LOST; + csq->type.type |= CSQ_START_RETAINED; + } } if ( dlen!=0 ) { @@ -1959,7 +2233,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INFRAME_DELETION; else csq->type.type |= CSQ_INFRAME_INSERTION; - if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' ) + if ( hap->tref_stop.s[hap->tref_stop.l-1]!='*' && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' ) csq->type.type |= CSQ_STOP_GAINED; } else @@ -1969,9 +2243,9 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, { if ( hap->tref.s[i] == hap->tseq.s[i] ) continue; aa_change = 1; - if ( hap->tref.s[i] == '*' ) + if ( hap->tref_stop.s[i] == '*' ) csq->type.type |= CSQ_STOP_LOST; - else if ( hap->tseq.s[i] == '*' ) + else if ( hap->tseq_stop.s[i] == '*' ) csq->type.type |= CSQ_STOP_GAINED; else csq->type.type |= CSQ_MISSENSE_VARIANT; @@ -1981,11 +2255,19 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } } // Check if compound inframe variants are real 
inframes, or if the stop codon occurs before the frameshift can be restored - if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' ) + if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq_stop.s[hap->tseq_stop.l-1]=='*' ) { rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING; csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED; } + if ( csq->type.type & CSQ_FRAMESHIFT_VARIANT && csq->type.type & CSQ_START_LOST ) + { + // this is to prevent + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+ + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G + rm_csq |= CSQ_FRAMESHIFT_VARIANT; + hap->stack[ibeg].node->type = HAP_SSS; + } if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; csq->type.type &= ~rm_csq; @@ -2006,12 +2288,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int aa_sbeg = tr->strand==STRAND_FWD ? 
node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); - kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); + kprint_aa_prediction(args,aa_rbeg,&hap->tref,&hap->tref_stop,&str); if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) { kputc_('>', &str); kputw(aa_sbeg, &str); - kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); + kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&hap->tseq_stop,&str); } kputc_('|', &str); @@ -2082,6 +2364,7 @@ void hap_finalize(args_t *args, hap_t *hap) hap->sseq.l = 0; hap->tseq.l = 0; + hap->tseq_stop.l = 0; hap->stack[0].node = TSCRIPT_AUX(tr)->root; hap->stack[0].ichild = -1; hap->stack[0].slen = 0; @@ -2169,13 +2452,13 @@ void hap_finalize(args_t *args, hap_t *hap) } else // splice site overlap, see #1475227917 sseq.l = fill = 0; - cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill); // ref sseq.l = node2rend(i) - rbeg; sseq.s = sref.s + N_REF_PAD + rbeg; sseq.m = sref.m - 2*N_REF_PAD; - cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill); sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel); @@ -2227,13 +2510,13 @@ void hap_finalize(args_t *args, hap_t *hap) } else // splice site overlap, see #1475227917 sseq.l = fill = 0; - cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); + cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, &hap->tseq_stop, fill); // ref sseq.l = node2rend(ibeg) - rbeg; sseq.s = sref.s + N_REF_PAD + rbeg; sseq.m = sref.m - 2*N_REF_PAD; - cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); + cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, &hap->tref_stop, fill); sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; 
hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel); @@ -2491,13 +2774,7 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) int i, len; int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; - const char *tmp_chr = chr; - if ( !faidx_has_seq(args->fai,tmp_chr) ) - { - tmp_chr = drop_chr_prefix(args,chr); - if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr); - } - TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); if ( !TSCRIPT_AUX(tr)->ref ) error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); @@ -2515,7 +2792,8 @@ void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) } } -static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) +// returns 0 on success, negative number on reference mismatch +static int sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) { int vbeg = 0; int rbeg = rec->pos - tr->beg + N_REF_PAD; @@ -2527,23 +2805,40 @@ static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) while ( ref[i] && vcf[i] ) { if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) - error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", - bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + { + if ( !args->force ) + error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + + else if ( args->verbosity && (!args->warned.ref_allele_mismatch || args->verbosity > 1) ) + { + fprintf(bcftools_stderr,"Warning: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); + if ( args->verbosity < 2 ) + fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); + } + args->warned.ref_allele_mismatch++; + return -1; + } i++; } + return 0; } int test_cds_local(args_t *args, bcf1_t *rec) { int i,j, ret = 0; - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; // structures to fake the normal test_cds machinery hap_node_t root, node; root.type = HAP_ROOT; kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq; + kstring_t *tref_stop = &args->hap->tref_stop, *tseq_stop = &args->hap->tseq_stop; while ( regitr_overlap(args->itr) ) { @@ -2555,12 +2850,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) if ( !TSCRIPT_AUX(tr) ) { tr->aux = calloc(sizeof(tscript_t),1); - tscript_init_ref(args, tr, chr); + tscript_init_ref(args, tr, chr_fai); tscript_splice_ref(tr); khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards } - sanity_check_ref(args, tr, rec); + if ( sanity_check_ref(args, tr, rec)<0 ) continue; kstring_t sref; sref.s = TSCRIPT_AUX(tr)->sref; @@ -2596,40 +2891,44 @@ int test_cds_local(args_t *args, bcf1_t *rec) sseq.s = node.seq; int alen = sseq.l = strlen(sseq.s); int fill = node.dlen%3 && alen ? 
1 : 0; // see #1475227917 - cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill); + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, tseq_stop, fill); sseq.m = sref.m - 2*N_REF_PAD; sseq.s = sref.s + N_REF_PAD + node.sbeg; sseq.l = node.rlen; - cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill); + cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, tref_stop, fill); // check for truncating stops - for (j=0; jl; j++) - if ( tref->s[j]=='*' ) break; - if ( j!=tref->l ) + for (j=0; jl; j++) + if ( tref_stop->s[j]=='*' ) break; + if ( j!=tref_stop->l ) { tref->l = j+1; tref->s[j+1] = 0; + tref_stop->l = j+1; + tref_stop->s[j+1] = 0; } - for (j=0; jl; j++) - if ( tseq->s[j]=='*' ) break; + for (j=0; jl; j++) + if ( tseq_stop->s[j]=='*' ) break; if ( j!=tseq->l ) { tseq->l = j+1; tseq->s[j+1] = 0; + tseq_stop->l = j+1; + tseq_stop->s[j+1] = 0; } if ( csq_type & CSQ_STOP_LOST ) { - if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + if ( tref_stop->s[tref_stop->l-1]=='*' && tref_stop->s[tref_stop->l-1] == tseq_stop->s[tseq_stop->l-1] ) { csq_type &= ~CSQ_STOP_LOST; csq_type |= CSQ_STOP_RETAINED; } - else if (tref->s[tref->l-1]!='*' ) + else if (tref_stop->s[tref_stop->l-1]!='*' ) { // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf - if ( tseq->s[tseq->l-1] == '*' ) + if ( tseq_stop->s[tseq_stop->l-1] == '*' ) { csq_type &= ~CSQ_STOP_GAINED; csq_type |= CSQ_STOP_RETAINED; @@ -2638,7 +2937,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INCOMPLETE_CDS; } } - if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' ) + if ( csq_type & CSQ_START_LOST && tref_stop->s[0]!='M' ) csq_type &= ~CSQ_START_LOST; if ( node.dlen!=0 ) { @@ -2648,8 +2947,20 @@ int 
test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INFRAME_DELETION; else csq_type |= CSQ_INFRAME_INSERTION; - if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' ) + if ( tref_stop->s[tref_stop->l-1]!='*' && tseq_stop->s[tseq_stop->l-1]=='*' ) csq_type |= CSQ_STOP_GAINED; + if ( csq_type & CSQ_START_LOST && csq_type & CSQ_FRAMESHIFT_VARIANT ) + { + // this is to prevent + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+ + // 4959 GA G start_lost|NBPF3|ENST00000318249|protein_coding|+|1M>1?|4959GA>G + csq_type &= ~CSQ_FRAMESHIFT_VARIANT; + node.type = HAP_SSS; + csq_stage(args, &csq, rec); + free(node.seq); + free(node.var); + continue; + } } else { @@ -2658,9 +2969,9 @@ int test_cds_local(args_t *args, bcf1_t *rec) { if ( tref->s[j] == tseq->s[j] ) continue; aa_change = 1; - if ( tref->s[j] == '*' ) + if ( tref_stop->s[j] == '*' ) csq_type |= CSQ_STOP_LOST; - else if ( tseq->s[j] == '*' ) + else if ( tseq_stop->s[j] == '*' ) csq_type |= CSQ_STOP_GAINED; else csq_type |= CSQ_MISSENSE_VARIANT; @@ -2676,12 +2987,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) int aa_sbeg = tr->strand==STRAND_FWD ? 
node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); - kprint_aa_prediction(args,aa_rbeg,tref,&str); + kprint_aa_prediction(args,aa_rbeg,tref,tref_stop,&str); if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) { kputc_('>', &str); kputw(aa_sbeg, &str); - kprint_aa_prediction(args,aa_sbeg,tseq,&str); + kprint_aa_prediction(args,aa_sbeg,tseq,tseq_stop,&str); } kputc_('|', &str); kputw(rec->pos+1, &str); @@ -2717,9 +3028,11 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) static int overlaps_warned = 0, multiploid_warned = 0; int i, ret = 0, hap_ret; - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_cds,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); @@ -2731,7 +3044,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { // initialize the transcript and its haplotype tree, fetch the reference sequence tr->aux = calloc(sizeof(tscript_t),1); - tscript_init_ref(args, tr, chr); + tscript_init_ref(args, tr, chr_fai); TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 
1 : 2*args->smpl->n; // maximum ploidy = diploid @@ -2743,7 +3056,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) khp_insert(trhp, args->active_tr, &tr); } - sanity_check_ref(args, tr, rec); + if ( sanity_check_ref(args, tr, rec)<0 ) continue; if ( args->phase==PHASE_DROP_GT ) { @@ -2760,13 +3073,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(bcftools_stderr, "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", - chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( !overlaps_warned ) - fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); overlaps_warned = 1; } if ( args->out ) - fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); } else ret = 1; // prevent reporting as intron in test_tscript hap_destroy(child); @@ -2807,13 +3120,13 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(bcftools_stderr, "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", - chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( !multiploid_warned ) - fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); multiploid_warned = 1; } if ( args->out ) - fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid 
genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr_vcf,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); continue; } for (ismpl=0; ismplsmpl->n; ismpl++) @@ -2830,7 +3143,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) { if ( args->phase==PHASE_REQUIRE ) - error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); if ( args->phase==PHASE_SKIP ) continue; if ( args->phase==PHASE_NON_REF ) @@ -2873,14 +3186,14 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) { fprintf(bcftools_stderr, "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", - chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); if ( !overlaps_warned ) - fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); + fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbosity 2`\n"); overlaps_warned = 1; } if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", - chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); + chr_vcf,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); } hap_destroy(child); continue; @@ -2992,9 +3305,10 @@ void 
csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } int test_utr(args_t *args, bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); // note that the off-by-one extension of rlen is deliberate to account for insertions - if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + if ( !regidx_overlap(args->idx_utr,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3030,8 +3344,9 @@ int test_utr(args_t *args, bcf1_t *rec) } int test_splice(args_t *args, bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); - if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !regidx_overlap(args->idx_exon,chr_gff,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3062,8 +3377,9 @@ int test_splice(args_t *args, bcf1_t *rec) } int test_tscript(args_t *args, bcf1_t *rec) { - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); - if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !regidx_overlap(args->idx_tscript,chr_gff,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; splice_init(&splice, rec); @@ -3105,7 +3421,8 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) warned = 1; } - const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); + const char *chr_vcf = bcf_seqname(args->hdr,rec); + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); // only 
insertions atm int beg = rec->pos + 1; @@ -3113,7 +3430,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) int csq_class = CSQ_ELONGATION; int hit = 0; - if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_cds,chr_gff,beg,end, args->itr) ) { while ( regitr_overlap(args->itr) ) { @@ -3131,7 +3448,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) hit = 1; } } - if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_utr,chr_gff,beg,end, args->itr) ) { while ( regitr_overlap(args->itr) ) { @@ -3149,7 +3466,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) hit = 1; } } - if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) + if ( regidx_overlap(args->idx_exon,chr_gff,beg,end, args->itr) ) { splice_t splice; splice_init(&splice, rec); @@ -3168,7 +3485,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) if ( splice.csq ) hit = 1; } } - if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) + if ( !hit && regidx_overlap(args->idx_tscript,chr_gff,beg,end, args->itr) ) { splice_t splice; splice_init(&splice, rec); @@ -3229,6 +3546,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) bcf1_t *rec = *rec_ptr; static int32_t prev_rid = -1, prev_pos = -1; + const char *chr_vcf = bcf_seqname(args->hdr,rec); if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; @@ -3237,14 +3555,28 @@ static void process(args_t *args, bcf1_t **rec_ptr) // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX). 
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the // reference file - if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) + const char *chr_fai = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_FAI); + if ( !faidx_has_seq(args->fai,chr_fai) ) { - if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) ) - error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + static int missing_chr_fai_warned = 0; + if ( !args->force ) + error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_fai,args->fa_fname,args->unify_chr_names_err); + else if ( !missing_chr_fai_warned++ ) + fprintf(bcftools_stderr,"Warning: the chromosome \"%s\" is not present in %s. This warning is printed only once.\n",chr_fai,args->fa_fname); + } + + const char *chr_gff = unify_chr_name(args, chr_vcf, CHR_VCF,CHR_GFF); + if ( !gff_has_seq(args->gff,chr_gff) ) + { + static int missing_chr_gff_warned = 0; + if ( !args->force ) + error("Error: the chromosome \"%s\" is not present in %s\n %s\n",chr_gff,args->gff_fname,args->unify_chr_names_err); + else if ( !missing_chr_gff_warned++ ) + fprintf(bcftools_stderr,"Warning: the chromosome \"%s\" is not present in %s. 
This warning is printed only once.\n",chr_gff,args->gff_fname); } } if ( prev_pos > rec->pos ) - error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",chr_vcf,prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); int call_csq = 1; if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele @@ -3307,6 +3639,7 @@ static const char *usage(void) "\n" "CSQ options:\n" " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -C, --genetic-code INT|l Specify the genetic code table to use, 'l' to print a list [0]\n" " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" " -l, --local-csq Localized predictions, consider only one VCF record at a time\n" " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" @@ -3319,7 +3652,8 @@ static const char *usage(void) "GFF options:\n" " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n" " --force Run even if some sanity checks fail\n" - " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. 
chrX vs X) in GFF, fasta, and VCF [1]\n" + " --unify-chr-names 0|LIST Unify chromosome naming by stripping a prefix in VCF,GFF,fasta, respectively [0]\n" + " (e.g., \"chr,Chr,-\" trims \"chr\" in VCF and \"Chr\" in GFF, fasta is unchanged)\n" "General options:\n" " -e, --exclude EXPR Exclude sites for which the expression is true\n" " -i, --include EXPR Select sites for which the expression is true\n" @@ -3336,7 +3670,7 @@ static const char *usage(void) " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" - " -v, --verbose INT Verbosity level 0-2 [1]\n" + " -v, --verbosity INT Verbosity level 0-6 [1]\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Example:\n" @@ -3358,11 +3692,11 @@ int main_csq(int argc, char *argv[]) args->verbosity = 1; args->record_cmd_line = 1; args->clevel = -1; - args->unify_chr_names = 1; static struct option loptions[] = { {"force",0,0,1}, + {"genetic-code",required_argument,NULL,'C'}, {"threads",required_argument,NULL,2}, {"help",0,0,'h'}, {"ncsq",1,0,'n'}, @@ -3379,6 +3713,7 @@ int main_csq(int argc, char *argv[]) {"phase",1,0,'p'}, {"quiet",0,0,'q'}, {"verbose",1,0,'v'}, + {"verbosity",1,0,'v'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"regions-overlap",required_argument,NULL,4}, @@ -3397,7 +3732,7 @@ int main_csq(int argc, char *argv[]) int regions_overlap = 1; int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; - while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:C:ln:bB:v:W::",loptions,NULL)) >= 0) { switch (c) { @@ -3416,11 +3751,13 @@ int main_csq(int argc, char *argv[]) if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: 
--trim-protein-seq %s\n", optarg); break; case 'l': args->local_csq = 1; break; + case 'C': args->gencode_str = optarg; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; case 'v': args->verbosity = atoi(optarg); - if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); + if ( args->verbosity<0 ) error("Error: expected integer with -v, --verbosity\n"); + if ( args->verbosity > 3 ) hts_verbose = args->verbosity; break; case 'p': switch (optarg[0]) @@ -3484,16 +3821,14 @@ int main_csq(int argc, char *argv[]) error("Unsupported index format '%s'\n", optarg); break; case 7 : args->dump_gff = optarg; break; - case 8 : - if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; - else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; - else error("Could not parse: --unify-chr-names %s\n",optarg); - break; + case 8 : args->unify_chr_names = optarg; break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; } } + init_gencode(args); + char *fname = NULL; if ( optind==argc ) { diff --git a/bcftools/filter.c b/bcftools/filter.c index c9dcd023..2e74f0a2 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -73,7 +73,7 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int is_constant; // the threshold is set - int hdr_id, hl_type, ht_type; // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types + int hdr_id, hl_type, ht_type, vl_len; // BCF header lookup ID and one of BCF_HL_*, BCF_HT_*, BCF_VL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] 
or any field[*], which is equivalent to [0..]) // -3: select indices on the fly based on values in GT @@ -167,6 +167,7 @@ struct _filter_t #define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" #define TOK_MODULO 40 // % #define TOK_EXT 41 // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float} +#define TOK_FISHER 42 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 // ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s % @@ -219,9 +220,10 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } - if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } + if ( !strncasecmp(tmp,"COUNT(",6) ) { (*str) += 5; return TOK_CNT; } if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } + if ( !strncasecmp(tmp,"FISHER(",6) ) { (*str) += 6; return -TOK_FISHER; } if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility @@ -324,10 +326,6 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } -#define FILTER_OK 0 -#define FILTER_ERR_UNKN_TAGS 1 -#define FILTER_ERR_OTHER 2 - static void filter_add_undef_tag(filter_t *filter, char *str) { int i; @@ -1191,12 +1189,9 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) } static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) { + tok->nvalues = tok->str_value.l = 0; bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); - if ( !fmt ) 
- { - tok->nvalues = tok->str_value.l = 0; - return; - } + if ( !fmt ) return; int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4; if ( tok->str_value.m <= nvals1*nsmpl ) @@ -1276,12 +1271,10 @@ static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { + tok->nvalues = tok->str_value.l = 0; bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); - if ( !fmt ) - { - tok->nvalues = 0; - return; - } + if ( !fmt ) return; + int i, blen = 4, nsmpl = line->n_sample; gt_length_too_big: @@ -2036,6 +2029,154 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta } return 1; } +static int func_fisher(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + int i, istack = nstack - rtok->nargs; + token_t *tok = stack[istack]; + token_t *tok2 = istack+2==nstack ? stack[istack+1] : NULL; + if ( !tok->nsamples ) + { + // INFO tag, such as DP4 + rtok->nvalues = 1; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); + double *n11 = NULL, *n12 = NULL, *n21 = NULL, *n22 = NULL; + if ( istack+1==nstack ) + { + // only one tag, expecting four values: binom(INFO/DP4) + if ( tok->nvalues==4 ) + { + n11 = &tok->values[0]; + n12 = &tok->values[1]; + n21 = &tok->values[2]; + n22 = &tok->values[3]; + } + } + else if ( istack+2==nstack ) + { + // two tags, expecting two values in each: binom(INFO/ADF[0,2],INFO/ADR[0,2]) + if ( tok->nvalues>=2 && tok2->nvalues>=2 ) + { + n11 = &tok->values[0]; + n21 = &tok->values[1]; + n12 = &tok2->values[0]; + n22 = &tok2->values[1]; + } + } + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[0]); + else + { + double left,right,two; + 
kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[0] = two; + } + } + else + { + rtok->nval1 = 1; + rtok->nvalues = tok->nsamples; + rtok->nsamples = tok->nsamples; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + + if ( istack+1==nstack && tok->nval1==4 ) + { + // only one tag, expecting four values: fisher(FORMAT/DP4) + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + double *n11 = tok->values + tok->nval1*i; + double *n12 = tok->values + tok->nval1*i+1; + double *n21 = tok->values + tok->nval1*i+2; + double *n22 = tok->values + tok->nval1*i+3; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]); + else + { + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R && tok->nuidxs+tok2->nuidxs==4 ) + { + // two Number=R tags with explicit indices, e.g. 
fisher(FORMAT/ADF[:0,1],FORMAT/ADR[:0,1]) + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + double *n11 = tok->values + tok->nval1*i; + double *n12 = tok->values + tok->nval1*i+1; + double *n21 = tok2->values + tok2->nval1*i; + double *n22 = tok2->values + tok2->nval1*i+1; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]); + else + { + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R ) + { + // two Number=R tags, fisher(FORMAT/ADF,FORMAT/ADR), take thae ALT allele index from GT + int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi); + int max_ploidy = ngt/line->n_sample; + if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set + { + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]); + return rtok->nargs; + } + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int32_t *ptr = flt->tmpi + i*max_ploidy; + if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) + { + bcf_double_set_missing(rtok->values[i]); + continue; + } + int idx1 = bcf_gt_allele(ptr[0]); + int idx2 = bcf_gt_allele(ptr[1]); + if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); + if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); + double *vals = tok->values + tok->nval1*i; + double *vals2 = tok2->values + tok2->nval1*i; + double *n11 = &vals[idx1]; + double 
*n12 = &vals[idx2]; + double *n21 = &vals2[idx1]; + double *n22 = &vals2[idx2]; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) + { + bcf_double_set_missing(rtok->values[i]); + continue; + } + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + return rtok->nargs; +} static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { int i, istack = nstack - rtok->nargs; @@ -2181,7 +2322,6 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac rtok->nsamples = tok->nsamples; rtok->nval1 = tok->nval1; memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); - assert(tok->usmpl); if ( !rtok->usmpl ) { rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); @@ -2618,9 +2758,11 @@ static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logi char *mid = str; while ( mid < end && *mid && *mid!=',' ) mid++; int miss = mid - str == 1 && str[0]=='.' ? 1 : 0; - if ( miss && missing_logic[miss] ) return 1; + int match = ( miss && missing_logic[miss] ) ? 1 : 0; + if ( logic==TOK_NLIKE ) match = match ? 0 : 1; + if ( match ) return 1; char tmp = *mid; *mid = 0; - int match = regexec(regex, str, 0,NULL,0) ? 0 : 1; + match = regexec(regex, str, 0,NULL,0) ? 0 : 1; *mid = tmp; if ( logic==TOK_NLIKE ) match = match ? 0 : 1; if ( match ) return 1; @@ -2707,6 +2849,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { token_t *tok = atok->regex ? 
btok : atok; rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic); + fprintf(stderr,"pass=%d [%s]\n",rtok->pass_site,tok->str_value.s); } return; } @@ -2955,6 +3098,7 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr) } static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok) { + tok->vl_len = BCF_VL_FIXED; tok->hl_type = -1; tok->ht_type = -1; tok->tok_type = TOK_VAL; @@ -2971,6 +3115,7 @@ static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok) } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { + tok->vl_len = BCF_VL_FIXED; tok->ht_type = -1; tok->hl_type = -1; tok->tok_type = TOK_VAL; @@ -3168,6 +3313,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; + if ( tok->hdr_id >= 0 ) tok->vl_len = bcf_hdr_id2length(filter->hdr,tok->hl_type,tok->hdr_id); if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) { @@ -3502,6 +3648,49 @@ static void determine_ext_types(filter_t *filter, int ntok, token_t *tok) } } +// Same as hts_readlist but recognizes brackets, () and [], and does not split by comma inside +// Possible todo: the escaping is simplistic, does not check the syntax, so "([)]" is not distinguishable from "([])" +char **parse_tag_list(const char *string, int *_n) +{ + *_n = 0; + unsigned int m = 0, n = 0; + char **s = 0, **s_new; + const char *q = string, *p = string; + int escape_bracket = 0; + while ( 1 ) + { + if ((*p == ',' && !escape_bracket) || *p == 0) + { + if (hts_resize(char*, n + 1, &m, &s, 0) < 0) + goto err; + s[n] = (char*)calloc(p - q + 1, 1); + if (!s[n]) + goto err; + strncpy(s[n++], q, p - q); + q = p + 1; + } + if ( !*p ) break; + if ( *p=='[' || *p=='(' ) escape_bracket++; + if ( (*p==']' || *p==')') && escape_bracket ) escape_bracket--; + p++; + } + + // Try to shrink s to the minimum size needed + s_new = 
(char**)realloc(s, n * sizeof(char*)); + if (!s_new) + goto err; + + s = s_new; + assert(n < INT_MAX); // hts_resize() should ensure this + *_n = n; + return s; + +err: + for (m = 0; m < n; m++) + free(s[m]); + free(s); + return NULL; +} // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error) @@ -3573,7 +3762,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error } else if ( ret == -TOK_FUNC ) { - // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the + // this is different from TOK_PERLSUB,TOK_BINOM,TOK_FISHER in that the expression inside the // brackets gets evaluated as normal expression nops++; hts_expand0(token_t, nops, mops, ops); @@ -3597,7 +3786,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error else error("The function \"%s\" is not supported\n", tmp-len); continue; } - else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM + else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM,TOK_FISHER { ret = -ret; @@ -3609,7 +3798,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error if ( ret == TOK_PERLSUB ) { while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++; - if ( *beg!='(' ) error("Could not parse the expression: %s\n", str); + if ( *beg!='(' ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str); // the subroutine name kputc('"', &rmme); @@ -3622,12 +3811,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error } char *end = beg; while ( *end && *end!=')' ) end++; - if ( !*end ) error("Could not parse the expression: %s\n", str); + if ( !*end ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str); // subroutine arguments rmme.l = 0; kputsn(beg+1, end-beg-1, &rmme); - char 
**rmme_list = hts_readlist(rmme.s, 0, &margs); + char **rmme_list = parse_tag_list(rmme.s, &margs); for (i=0; iexit_on_error ) + error("[%s:%d %s] Error: could not parse the expression \"%s\"\n", __FILE__,__LINE__,__FUNCTION__,filter->str); + filter->status |= FILTER_ERR_OTHER; + } if ( filter->status != FILTER_OK ) { if ( mops ) free(ops); @@ -3745,6 +3940,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error if ( type==BCF_HT_INT ) set_missing = 1; else if ( type==BCF_HT_REAL ) set_missing = 1; } + else if ( !out[k].tag ) error("Error: could not parse the expression\n"); // e.g. =~ else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1; if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } @@ -3890,6 +4086,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_FISHER ) { out[i].func = func_fisher; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; } diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 2db56801..e3d14aff 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -2,7 +2,7 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -75,7 +75,7 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int is_constant; // the threshold is set - int hdr_id, hl_type, ht_type; // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types + int hdr_id, hl_type, ht_type, vl_len; // BCF header lookup ID and one of BCF_HL_*, BCF_HT_*, BCF_VL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) // -3: select indices on the fly based on values in GT @@ -169,6 +169,7 @@ struct _filter_t #define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" #define TOK_MODULO 40 // % #define TOK_EXT 41 // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float} +#define TOK_FISHER 42 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 // ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s % @@ -221,9 +222,10 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } - if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } + if ( !strncasecmp(tmp,"COUNT(",6) ) { (*str) += 5; return TOK_CNT; } if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } + if ( !strncasecmp(tmp,"FISHER(",6) ) { (*str) += 6; return -TOK_FISHER; } if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility @@ -326,10 +328,6 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } -#define FILTER_OK 0 -#define FILTER_ERR_UNKN_TAGS 1 -#define FILTER_ERR_OTHER 2 - static void filter_add_undef_tag(filter_t *filter, char *str) { int i; @@ -1193,12 +1191,9 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) } static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) { + tok->nvalues = tok->str_value.l = 0; bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); - if ( !fmt ) - { - tok->nvalues = tok->str_value.l = 0; - return; - } + if ( !fmt ) return; int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 
3 : 4; if ( tok->str_value.m <= nvals1*nsmpl ) @@ -1278,12 +1273,10 @@ static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { + tok->nvalues = tok->str_value.l = 0; bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); - if ( !fmt ) - { - tok->nvalues = 0; - return; - } + if ( !fmt ) return; + int i, blen = 4, nsmpl = line->n_sample; gt_length_too_big: @@ -2038,6 +2031,154 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta } return 1; } +static int func_fisher(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + int i, istack = nstack - rtok->nargs; + token_t *tok = stack[istack]; + token_t *tok2 = istack+2==nstack ? stack[istack+1] : NULL; + if ( !tok->nsamples ) + { + // INFO tag, such as DP4 + rtok->nvalues = 1; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); + double *n11 = NULL, *n12 = NULL, *n21 = NULL, *n22 = NULL; + if ( istack+1==nstack ) + { + // only one tag, expecting four values: binom(INFO/DP4) + if ( tok->nvalues==4 ) + { + n11 = &tok->values[0]; + n12 = &tok->values[1]; + n21 = &tok->values[2]; + n22 = &tok->values[3]; + } + } + else if ( istack+2==nstack ) + { + // two tags, expecting two values in each: binom(INFO/ADF[0,2],INFO/ADR[0,2]) + if ( tok->nvalues>=2 && tok2->nvalues>=2 ) + { + n11 = &tok->values[0]; + n21 = &tok->values[1]; + n12 = &tok2->values[0]; + n22 = &tok2->values[1]; + } + } + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[0]); + else + { + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[0] = two; + } + } + else + { + rtok->nval1 = 1; + rtok->nvalues = tok->nsamples; + 
rtok->nsamples = tok->nsamples; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + + if ( istack+1==nstack && tok->nval1==4 ) + { + // only one tag, expecting four values: fisher(FORMAT/DP4) + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + double *n11 = tok->values + tok->nval1*i; + double *n12 = tok->values + tok->nval1*i+1; + double *n21 = tok->values + tok->nval1*i+2; + double *n22 = tok->values + tok->nval1*i+3; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]); + else + { + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R && tok->nuidxs+tok2->nuidxs==4 ) + { + // two Number=R tags with explicit indices, e.g. 
fisher(FORMAT/ADF[:0,1],FORMAT/ADR[:0,1]) + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + double *n11 = tok->values + tok->nval1*i; + double *n12 = tok->values + tok->nval1*i+1; + double *n21 = tok2->values + tok2->nval1*i; + double *n22 = tok2->values + tok2->nval1*i+1; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) bcf_double_set_missing(rtok->values[i]); + else + { + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + else if ( istack+2==nstack && tok->vl_len==BCF_VL_R && tok2->vl_len==BCF_VL_R ) + { + // two Number=R tags, fisher(FORMAT/ADF,FORMAT/ADR), take thae ALT allele index from GT + int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi); + int max_ploidy = ngt/line->n_sample; + if ( ngt <= 0 || max_ploidy < 2 ) // GT not present or not diploid, cannot set + { + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) bcf_double_set_missing(rtok->values[i]); + return rtok->nargs; + } + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int32_t *ptr = flt->tmpi + i*max_ploidy; + if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) + { + bcf_double_set_missing(rtok->values[i]); + continue; + } + int idx1 = bcf_gt_allele(ptr[0]); + int idx2 = bcf_gt_allele(ptr[1]); + if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); + if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); + double *vals = tok->values + tok->nval1*i; + double *vals2 = tok2->values + tok2->nval1*i; + double *n11 = &vals[idx1]; + double 
*n12 = &vals[idx2]; + double *n21 = &vals2[idx1]; + double *n22 = &vals2[idx2]; + if ( !n11 || !n12 || !n21 || !n22 + || bcf_double_is_missing_or_vector_end(n11[0]) + || bcf_double_is_missing_or_vector_end(n12[0]) + || bcf_double_is_missing_or_vector_end(n21[0]) + || bcf_double_is_missing_or_vector_end(n22[0]) ) + { + bcf_double_set_missing(rtok->values[i]); + continue; + } + double left,right,two; + kt_fisher_exact(n11[0],n12[0],n21[0],n22[0], &left, &right, &two); + rtok->values[i] = two; + } + } + } + return rtok->nargs; +} static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { int i, istack = nstack - rtok->nargs; @@ -2183,7 +2324,6 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac rtok->nsamples = tok->nsamples; rtok->nval1 = tok->nval1; memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); - assert(tok->usmpl); if ( !rtok->usmpl ) { rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); @@ -2620,9 +2760,11 @@ static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logi char *mid = str; while ( mid < end && *mid && *mid!=',' ) mid++; int miss = mid - str == 1 && str[0]=='.' ? 1 : 0; - if ( miss && missing_logic[miss] ) return 1; + int match = ( miss && missing_logic[miss] ) ? 1 : 0; + if ( logic==TOK_NLIKE ) match = match ? 0 : 1; + if ( match ) return 1; char tmp = *mid; *mid = 0; - int match = regexec(regex, str, 0,NULL,0) ? 0 : 1; + match = regexec(regex, str, 0,NULL,0) ? 0 : 1; *mid = tmp; if ( logic==TOK_NLIKE ) match = match ? 0 : 1; if ( match ) return 1; @@ -2709,6 +2851,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { token_t *tok = atok->regex ? 
btok : atok; rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic); + fprintf(bcftools_stderr,"pass=%d [%s]\n",rtok->pass_site,tok->str_value.s); } return; } @@ -2957,6 +3100,7 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr) } static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok) { + tok->vl_len = BCF_VL_FIXED; tok->hl_type = -1; tok->ht_type = -1; tok->tok_type = TOK_VAL; @@ -2973,6 +3117,7 @@ static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok) } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { + tok->vl_len = BCF_VL_FIXED; tok->ht_type = -1; tok->hl_type = -1; tok->tok_type = TOK_VAL; @@ -3170,6 +3315,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; + if ( tok->hdr_id >= 0 ) tok->vl_len = bcf_hdr_id2length(filter->hdr,tok->hl_type,tok->hdr_id); if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) { @@ -3504,6 +3650,49 @@ static void determine_ext_types(filter_t *filter, int ntok, token_t *tok) } } +// Same as hts_readlist but recognizes brackets, () and [], and does not split by comma inside +// Possible todo: the escaping is simplistic, does not check the syntax, so "([)]" is not distinguishable from "([])" +char **parse_tag_list(const char *string, int *_n) +{ + *_n = 0; + unsigned int m = 0, n = 0; + char **s = 0, **s_new; + const char *q = string, *p = string; + int escape_bracket = 0; + while ( 1 ) + { + if ((*p == ',' && !escape_bracket) || *p == 0) + { + if (hts_resize(char*, n + 1, &m, &s, 0) < 0) + goto err; + s[n] = (char*)calloc(p - q + 1, 1); + if (!s[n]) + goto err; + strncpy(s[n++], q, p - q); + q = p + 1; + } + if ( !*p ) break; + if ( *p=='[' || *p=='(' ) escape_bracket++; + if ( (*p==']' || *p==')') && escape_bracket ) escape_bracket--; + p++; + } + + // Try to shrink s to the minimum size needed + s_new = 
(char**)realloc(s, n * sizeof(char*)); + if (!s_new) + goto err; + + s = s_new; + assert(n < INT_MAX); // hts_resize() should ensure this + *_n = n; + return s; + +err: + for (m = 0; m < n; m++) + free(s[m]); + free(s); + return NULL; +} // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error) @@ -3575,7 +3764,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error } else if ( ret == -TOK_FUNC ) { - // this is different from TOK_PERLSUB,TOK_BINOM in that the expression inside the + // this is different from TOK_PERLSUB,TOK_BINOM,TOK_FISHER in that the expression inside the // brackets gets evaluated as normal expression nops++; hts_expand0(token_t, nops, mops, ops); @@ -3599,7 +3788,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error else error("The function \"%s\" is not supported\n", tmp-len); continue; } - else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM + else if ( ret < 0 ) // variable number of arguments: TOK_PERLSUB,TOK_BINOM,TOK_FISHER { ret = -ret; @@ -3611,7 +3800,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error if ( ret == TOK_PERLSUB ) { while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++; - if ( *beg!='(' ) error("Could not parse the expression: %s\n", str); + if ( *beg!='(' ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str); // the subroutine name kputc('"', &rmme); @@ -3624,12 +3813,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error } char *end = beg; while ( *end && *end!=')' ) end++; - if ( !*end ) error("Could not parse the expression: %s\n", str); + if ( !*end ) error("[%s:%d] Could not parse the expression: %s\n", __FILE__,__LINE__,str); // subroutine arguments rmme.l = 0; kputsn(beg+1, end-beg-1, &rmme); - char 
**rmme_list = hts_readlist(rmme.s, 0, &margs); + char **rmme_list = parse_tag_list(rmme.s, &margs); for (i=0; iexit_on_error ) + error("[%s:%d %s] Error: could not parse the expression \"%s\"\n", __FILE__,__LINE__,__FUNCTION__,filter->str); + filter->status |= FILTER_ERR_OTHER; + } if ( filter->status != FILTER_OK ) { if ( mops ) free(ops); @@ -3747,6 +3942,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error if ( type==BCF_HT_INT ) set_missing = 1; else if ( type==BCF_HT_REAL ) set_missing = 1; } + else if ( !out[k].tag ) error("Error: could not parse the expression\n"); // e.g. =~ else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1; if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } @@ -3892,6 +4088,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_FISHER ) { out[i].func = func_fisher; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; } diff --git a/bcftools/gff.c b/bcftools/gff.c index 283ced33..119a6912 100644 --- a/bcftools/gff.c +++ b/bcftools/gff.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2023 Genome Research Ltd. + Copyright (c) 2023-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -87,11 +87,6 @@ typedef struct // mapping from transcript id to tscript, for quick CDS anchoring kh_int2tscript_t *id2tr; - // sequences - void *seq2int; // str2int hash - char **seq; - int nseq, mseq; - // ignored biotypes void *ignored_biotypes; @@ -111,18 +106,25 @@ struct gff_t_ // index iterator regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + // str2int hash with parsed sequence names + void *seq2int; + // temporary structures, deleted after initializtion aux_t init; + // sequences + char **seq; + int nseq, mseq; + // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx id_tbl_t tscript_ids; - int strip_chr_names, verbosity; + int verbosity; int force; // force run under various conditions. Currently only to skip out-of-phase transcripts struct { int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; - int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ftr_out_of_bounds; } warned; }; @@ -158,12 +160,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...) 
va_end(args); return 0; - case strip_chr_names: - va_start(args, key); - gff->strip_chr_names = va_arg(args,int); - va_end(args); - return 0; - case verbosity: va_start(args, key); gff->verbosity = va_arg(args,int); @@ -212,18 +208,17 @@ const char *gf_type2gff_string(int type) */ static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end) { - aux_t *aux = &gff->init; char tmp = chr_end[1]; chr_end[1] = 0; int iseq; - if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 ) { char *new_chr = strdup(chr_beg); - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = new_chr; - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + hts_expand(char*, gff->nseq+1, gff->mseq, gff->seq); + gff->seq[gff->nseq] = new_chr; + iseq = khash_str2int_inc(gff->seq2int, gff->seq[gff->nseq]); + gff->nseq++; + assert( gff->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq } chr_end[1] = tmp; return iseq; @@ -239,7 +234,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c char *se = (char*) line; while ( *se && *se!='\t' ) se++; if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3; *chr_beg = (char*) line; *chr_end = se-1; } @@ -633,9 +627,9 @@ static int cmp_cds_ptr(const void *a, const void *b) return 0; } -static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +static inline void chr_beg_end(gff_t *gff, int iseq, char **chr_beg, char **chr_end) { - *chr_beg = *chr_end = aux->seq[iseq]; + *chr_beg = *chr_end = gff->seq[iseq]; while ( (*chr_end)[1] ) (*chr_end)++; } static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid) @@ -674,7 +668,7 @@ static void register_utr(gff_t *gff, ftr_t *ftr) utr->tr = 
tscript_init(aux, ftr->trid); char *chr_beg, *chr_end; - chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, utr->tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); } static void register_exon(gff_t *gff, ftr_t *ftr) @@ -686,7 +680,7 @@ static void register_exon(gff_t *gff, ftr_t *ftr) exon->tr = tscript_init(aux, ftr->trid); char *chr_beg, *chr_end; - chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, exon->tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); } @@ -703,7 +697,7 @@ static void tscript_init_cds(gff_t *gff) // position-to-tscript lookup char *chr_beg, *chr_end; - chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); if ( !tr->ncds ) continue; // transcript with no CDS @@ -914,7 +908,7 @@ static int gff_dump(gff_t *gff, const char *fname) gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); char *gene_id = gff->init.gene_ids.str[gene->id]; str.l = 0; - ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used); + ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used); if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); } @@ -974,7 +968,7 @@ int gff_parse(gff_t *gff) if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname); aux_t *aux = &gff->init; - aux->seq2int = khash_str2int_init(); // 
chrom's numeric id + gff->seq2int = khash_str2int_init(); // chrom's numeric id aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL); @@ -1010,7 +1004,16 @@ int gff_parse(gff_t *gff) khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype + // check whether the feature respects transcript's beg,end coordinates gf_tscript_t *tr = kh_val(aux->id2tr,k); + if ( ftr->beg < tr->beg || ftr->end > tr->end ) + { + if ( !gff->warned.ftr_out_of_bounds || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF contains features outside the transcript boundaries .. %s\n",gff_id2string(gff,transcript,tr->id)); + gff->warned.ftr_out_of_bounds++; + if ( ftr->beg < tr->beg ) tr->beg = ftr->beg; + if ( ftr->end > tr->end ) tr->end = ftr->end; + } tr->used = 1; tr->gene->used = 1; @@ -1022,7 +1025,7 @@ int gff_parse(gff_t *gff) else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr); else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr); else - error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); + error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", gff->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); } tscript_init_cds(gff); @@ -1074,9 +1077,7 @@ int gff_parse(gff_t *gff) " or misc/gff2gff.py script can fix the problem (both do different things). 
See also the man page for the description\n" " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); - free(aux->seq); free(aux->ftr); - khash_str2int_destroy_free(aux->seq2int); // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); kh_destroy(int2tscript,aux->id2tr); gff_id_destroy(&aux->gene_ids); @@ -1110,7 +1111,20 @@ void gff_destroy(gff_t *gff) regidx_destroy(gff->idx_exon); regidx_destroy(gff->idx_tscript); + khash_str2int_destroy_free(gff->seq2int); gff_id_destroy(&gff->tscript_ids); + free(gff->seq); free(gff); } - +int gff_has_seq(gff_t *gff, const char *seq) +{ + return khash_str2int_has_key(gff->seq2int, seq); +} +int gff_nseq(gff_t *gff) +{ + return gff->nseq; +} +const char *gff_iseq(gff_t *gff, int i) +{ + return i>=0 && inseq ? gff->seq[i] : NULL; +} diff --git a/bcftools/gff.c.pysam.c b/bcftools/gff.c.pysam.c index 3722f606..03aea831 100644 --- a/bcftools/gff.c.pysam.c +++ b/bcftools/gff.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2023 Genome Research Ltd. + Copyright (c) 2023-2025 Genome Research Ltd. Author: Petr Danecek @@ -89,11 +89,6 @@ typedef struct // mapping from transcript id to tscript, for quick CDS anchoring kh_int2tscript_t *id2tr; - // sequences - void *seq2int; // str2int hash - char **seq; - int nseq, mseq; - // ignored biotypes void *ignored_biotypes; @@ -113,18 +108,25 @@ struct gff_t_ // index iterator regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + // str2int hash with parsed sequence names + void *seq2int; + // temporary structures, deleted after initializtion aux_t init; + // sequences + char **seq; + int nseq, mseq; + // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx id_tbl_t tscript_ids; - int strip_chr_names, verbosity; + int verbosity; int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts struct { int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; - int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds,ftr_out_of_bounds; } warned; }; @@ -160,12 +162,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...) va_end(args); return 0; - case strip_chr_names: - va_start(args, key); - gff->strip_chr_names = va_arg(args,int); - va_end(args); - return 0; - case verbosity: va_start(args, key); gff->verbosity = va_arg(args,int); @@ -214,18 +210,17 @@ const char *gf_type2gff_string(int type) */ static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end) { - aux_t *aux = &gff->init; char tmp = chr_end[1]; chr_end[1] = 0; int iseq; - if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 ) { char *new_chr = strdup(chr_beg); - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = new_chr; - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + hts_expand(char*, gff->nseq+1, gff->mseq, gff->seq); + gff->seq[gff->nseq] = new_chr; + iseq = khash_str2int_inc(gff->seq2int, gff->seq[gff->nseq]); + gff->nseq++; + assert( gff->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq } chr_end[1] = tmp; return iseq; @@ -241,7 +236,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c char *se = (char*) line; while ( *se && *se!='\t' ) se++; if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3; *chr_beg = (char*) line; *chr_end = se-1; } @@ -635,9 +629,9 @@ static int cmp_cds_ptr(const void *a, const void *b) return 0; } -static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char 
**chr_end) +static inline void chr_beg_end(gff_t *gff, int iseq, char **chr_beg, char **chr_end) { - *chr_beg = *chr_end = aux->seq[iseq]; + *chr_beg = *chr_end = gff->seq[iseq]; while ( (*chr_end)[1] ) (*chr_end)++; } static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid) @@ -676,7 +670,7 @@ static void register_utr(gff_t *gff, ftr_t *ftr) utr->tr = tscript_init(aux, ftr->trid); char *chr_beg, *chr_end; - chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, utr->tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); } static void register_exon(gff_t *gff, ftr_t *ftr) @@ -688,7 +682,7 @@ static void register_exon(gff_t *gff, ftr_t *ftr) exon->tr = tscript_init(aux, ftr->trid); char *chr_beg, *chr_end; - chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, exon->tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); } @@ -705,7 +699,7 @@ static void tscript_init_cds(gff_t *gff) // position-to-tscript lookup char *chr_beg, *chr_end; - chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + chr_beg_end(gff, tr->gene->iseq, &chr_beg, &chr_end); regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); if ( !tr->ncds ) continue; // transcript with no CDS @@ -916,7 +910,7 @@ static int gff_dump(gff_t *gff, const char *fname) gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); char *gene_id = gff->init.gene_ids.str[gene->id]; str.l = 0; - ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used); + 
ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used); if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); } @@ -976,7 +970,7 @@ int gff_parse(gff_t *gff) if ( gff->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", gff->fname); aux_t *aux = &gff->init; - aux->seq2int = khash_str2int_init(); // chrom's numeric id + gff->seq2int = khash_str2int_init(); // chrom's numeric id aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL); @@ -1012,7 +1006,16 @@ int gff_parse(gff_t *gff) khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype + // check whether the feature respects transcript's beg,end coordinates gf_tscript_t *tr = kh_val(aux->id2tr,k); + if ( ftr->beg < tr->beg || ftr->end > tr->end ) + { + if ( !gff->warned.ftr_out_of_bounds || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: The GFF contains features outside the transcript boundaries .. 
%s\n",gff_id2string(gff,transcript,tr->id)); + gff->warned.ftr_out_of_bounds++; + if ( ftr->beg < tr->beg ) tr->beg = ftr->beg; + if ( ftr->end > tr->end ) tr->end = ftr->end; + } tr->used = 1; tr->gene->used = 1; @@ -1024,7 +1027,7 @@ int gff_parse(gff_t *gff) else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr); else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr); else - error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); + error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", gff->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); } tscript_init_cds(gff); @@ -1076,9 +1079,7 @@ int gff_parse(gff_t *gff) " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); - free(aux->seq); free(aux->ftr); - khash_str2int_destroy_free(aux->seq2int); // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); kh_destroy(int2tscript,aux->id2tr); gff_id_destroy(&aux->gene_ids); @@ -1112,7 +1113,20 @@ void gff_destroy(gff_t *gff) regidx_destroy(gff->idx_exon); regidx_destroy(gff->idx_tscript); + khash_str2int_destroy_free(gff->seq2int); gff_id_destroy(&gff->tscript_ids); + free(gff->seq); free(gff); } - +int gff_has_seq(gff_t *gff, const char *seq) +{ + return khash_str2int_has_key(gff->seq2int, seq); +} +int gff_nseq(gff_t *gff) +{ + return gff->nseq; +} +const char *gff_iseq(gff_t *gff, int i) +{ + return i>=0 && inseq ? gff->seq[i] : NULL; +} diff --git a/bcftools/gff.h b/bcftools/gff.h index afa945e8..ddde687d 100644 --- a/bcftools/gff.h +++ b/bcftools/gff.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2023-2024 Genome Research Ltd. + Copyright (c) 2023-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -289,7 +289,6 @@ typedef enum { // write options verbosity, // int, 0-2 - strip_chr_names, // int, 0 to leave as is, 1 to strip 'chr' prefix force_out_of_phase, // int, 1 to proceed even CDS exon out of expected phase dump_fname, // const char*, dump the parsed GFF into this file, for debugging purposes @@ -314,4 +313,9 @@ void *gff_get(gff_t *gff, gff_opt_t key); const char *gff_id2string(gff_t *gff, id_type_t type, int id); const char *gf_type2gff_string(int type); +int gff_has_seq(gff_t *gff, const char *chr); +int gff_nseq(gff_t *gff); +const char *gff_iseq(gff_t *gff, int i); + + #endif diff --git a/bcftools/main.c b/bcftools/main.c index 14357373..6de53680 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -265,7 +265,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2025 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index 56174fa2..9f720bae 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -267,7 +267,7 @@ int bcftools_main(int argc, char *argv[]) if (argc < 2) { usage(bcftools_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2025 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later 
\n"); #else diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index 943e0f6f..f87048ea 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2024 Genome Research Ltd. + Copyright (C) 2008-2025 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -651,6 +651,7 @@ static int mpileup(mplp_conf_t *conf) exit(EXIT_FAILURE); } } + regidx_set(conf->reg,merge_overlaps,1); nregs = regidx_nregs(conf->reg); if ( nregs ) { @@ -766,20 +767,20 @@ static int mpileup(mplp_conf_t *conf) if (conf->record_cmd_line) { ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); conf->buf.l = 0; ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); for (i=1; iargc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); kputc('\n', &conf->buf); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } if (conf->fai_fname) { conf->buf.l = 0; ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } // Translate BAM @SQ tags to BCF ##contig tags @@ -788,7 +789,7 @@ static int mpileup(mplp_conf_t *conf) { conf->buf.l = 0; ksprintf(&conf->buf, "##contig=", hdr->target_name[i], hdr->target_len[i]); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } conf->buf.l = 0; @@ -1269,6 +1270,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 
'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " -v, --verbosity INT Verbosity level\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" @@ -1464,10 +1466,14 @@ int main_mpileup(int argc, char *argv[]) {"no-poly-mqual", no_argument, NULL, 26}, {"score-vs-ref",required_argument, NULL, 27}, {"seqq-offset", required_argument, NULL, 28}, + {"verbosity",required_argument,NULL,'v'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::v:",lopts,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 16 : mplp.rflag_skip_any_unset = bam_str2flag(optarg); diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 4458b60f..099efcd0 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -2,7 +2,7 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2024 Genome Research Ltd. + Copyright (C) 2008-2025 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -653,6 +653,7 @@ static int mpileup(mplp_conf_t *conf) bcftools_exit(EXIT_FAILURE); } } + regidx_set(conf->reg,merge_overlaps,1); nregs = regidx_nregs(conf->reg); if ( nregs ) { @@ -768,20 +769,20 @@ static int mpileup(mplp_conf_t *conf) if (conf->record_cmd_line) { ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); conf->buf.l = 0; ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); for (i=1; iargc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); kputc('\n', &conf->buf); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } if (conf->fai_fname) { conf->buf.l = 0; ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } // Translate BAM @SQ tags to BCF ##contig tags @@ -790,7 +791,7 @@ static int mpileup(mplp_conf_t *conf) { conf->buf.l = 0; ksprintf(&conf->buf, "##contig=", hdr->target_name[i], hdr->target_len[i]); - bcf_hdr_append(conf->bcf_hdr, conf->buf.s); + if ( bcf_hdr_append(conf->bcf_hdr, conf->buf.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } conf->buf.l = 0; @@ -1271,6 +1272,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " -v, --verbosity INT Verbosity level\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" @@ -1466,10 +1468,14 @@ 
int main_mpileup(int argc, char *argv[]) {"no-poly-mqual", no_argument, NULL, 26}, {"score-vs-ref",required_argument, NULL, 27}, {"seqq-offset", required_argument, NULL, 28}, + {"verbosity",required_argument,NULL,'v'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::v:",lopts,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 16 : mplp.rflag_skip_any_unset = bam_str2flag(optarg); diff --git a/bcftools/ploidy.c b/bcftools/ploidy.c index 550ba876..d0884dcc 100644 --- a/bcftools/ploidy.c +++ b/bcftools/ploidy.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2014-2016 Genome Research Ltd. +/* + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -58,7 +58,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg ploidy_t *ploidy = (ploidy_t*) usr; void *sex2id = ploidy->sex2id; - // Check for special case of default ploidy "* * * " + // Check for special case of default ploidy "* * * SEX PLOIDY" int default_ploidy_def = 0; char *ss = (char*) line; @@ -112,7 +112,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg // Special case, chr="*" stands for a default value if ( default_ploidy_def ) { - ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy; + ploidy->sex2dflt[sp->sex] = sp->ploidy; return -1; } @@ -212,7 +212,7 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min { int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex; int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy; - if ( pld!=ploidy->dflt ) + if ( pld!=ploidy->dflt ) { if ( sex2ploidy ) sex2ploidy[ sex ] = pld; if ( _min > pld ) _min = pld; @@ -266,3 +266,23 @@ int ploidy_min(ploidy_t *ploidy) return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min; } +char *ploidy_format(ploidy_t *ploidy) +{ + kstring_t str = {0,0,0}; + + regitr_t *itr = regitr_init(ploidy->idx); + while ( regitr_loop(itr) ) + { + int id = regitr_payload(itr,sex_ploidy_t).sex; + int pld = regitr_payload(itr,sex_ploidy_t).ploidy; + ksprintf(&str,"%s\t%d\t%d\t%s\t%d\n", itr->seq, itr->beg+1, itr->end+1, ploidy->id2sex[id],pld); + } + regitr_destroy(itr); + + int i; + for (i=0; insex; i++) + ksprintf(&str,"*\t*\t*\t%s\t%d\n", ploidy->id2sex[i],ploidy->sex2dflt[i]); + + return str.s; +} + diff --git a/bcftools/ploidy.c.pysam.c b/bcftools/ploidy.c.pysam.c index aee0c567..02b34be1 100644 --- a/bcftools/ploidy.c.pysam.c +++ b/bcftools/ploidy.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" -/* - Copyright (C) 2014-2016 Genome Research Ltd. +/* + Copyright (C) 2014-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -11,10 +11,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -60,7 +60,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg ploidy_t *ploidy = (ploidy_t*) usr; void *sex2id = ploidy->sex2id; - // Check for special case of default ploidy "* * * " + // Check for special case of default ploidy "* * * SEX PLOIDY" int default_ploidy_def = 0; char *ss = (char*) line; @@ -114,7 +114,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg // Special case, chr="*" stands for a default value if ( default_ploidy_def ) { - ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy; + ploidy->sex2dflt[sp->sex] = sp->ploidy; return -1; } @@ -214,7 +214,7 @@ int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min { int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex; int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy; - if ( pld!=ploidy->dflt ) + if ( pld!=ploidy->dflt ) { if ( sex2ploidy ) sex2ploidy[ sex ] = pld; if ( _min > pld ) _min = pld; @@ -268,3 +268,23 @@ int ploidy_min(ploidy_t *ploidy) return ploidy->dflt < ploidy->min ? 
ploidy->dflt : ploidy->min; } +char *ploidy_format(ploidy_t *ploidy) +{ + kstring_t str = {0,0,0}; + + regitr_t *itr = regitr_init(ploidy->idx); + while ( regitr_loop(itr) ) + { + int id = regitr_payload(itr,sex_ploidy_t).sex; + int pld = regitr_payload(itr,sex_ploidy_t).ploidy; + ksprintf(&str,"%s\t%d\t%d\t%s\t%d\n", itr->seq, itr->beg+1, itr->end+1, ploidy->id2sex[id],pld); + } + regitr_destroy(itr); + + int i; + for (i=0; insex; i++) + ksprintf(&str,"*\t*\t*\t%s\t%d\n", ploidy->id2sex[i],ploidy->sex2dflt[i]); + + return str.s; +} + diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h index 7697c65f..7625bd06 100644 --- a/bcftools/ploidy.h +++ b/bcftools/ploidy.h @@ -1,5 +1,5 @@ -/* - Copyright (C) 2014-2015 Genome Research Ltd. +/* + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -81,8 +81,8 @@ void ploidy_destroy(ploidy_t *ploidy); * @param seq: chromosome name * @param pos: 0-based position * @param sex2ploidy: if not NULL, array will be filled with mapping from sex id to ploidy - * @param min: if not NULL, minimum encountered encountered will be set - * @param max: if not NULL, maximum encountered encountered will be set + * @param min: if not NULL, minimum encountered ploidy will be set + * @param max: if not NULL, maximum encountered ploidy will be set * * Returns 1 if the position is listed in the regions or 0 otherwise. 
*/ @@ -125,5 +125,8 @@ regidx_t *ploidy_regions(ploidy_t *ploidy); int ploidy_max(ploidy_t *ploidy); int ploidy_min(ploidy_t *ploidy); +/** Create a parseable ploidy file for debugging. The string must be free()-ed by the caller */ +char *ploidy_format(ploidy_t *ploidy); + #endif diff --git a/bcftools/read_consensus.c b/bcftools/read_consensus.c index 593b19b5..f66cc7dc 100644 --- a/bcftools/read_consensus.c +++ b/bcftools/read_consensus.c @@ -521,7 +521,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns) } else if ( cvar->vtype==ins ) { - int len; + int len = 0; ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg]; int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len); if ( iseq==-2 ) break; @@ -533,7 +533,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns) } else if ( cvar->vtype==del ) { - int len; + int len = 0; del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg]; int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len); if ( ret==-2 ) break; diff --git a/bcftools/read_consensus.c.pysam.c b/bcftools/read_consensus.c.pysam.c index ef2ff089..802ce8ed 100644 --- a/bcftools/read_consensus.c.pysam.c +++ b/bcftools/read_consensus.c.pysam.c @@ -523,7 +523,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns) } else if ( cvar->vtype==ins ) { - int len; + int len = 0; ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg]; int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len); if ( iseq==-2 ) break; @@ -535,7 +535,7 @@ static int create_haplotype_frequency_spectrum(read_cns_t *rcns) } else if ( cvar->vtype==del ) { - int len; + int len = 0; del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg]; int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len); if ( ret==-2 ) break; diff --git a/bcftools/regidx.c b/bcftools/regidx.c index cdaf7eaf..445d7d58 100644 --- a/bcftools/regidx.c +++ b/bcftools/regidx.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2014-2018 Genome 
Research Ltd. +/* + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -65,12 +65,13 @@ struct _reglist_t void *dat; // payload data char *seq; // sequence name int unsorted; - + int merged; }; // Container of all sequences struct _regidx_t { + int merge_overlaps; int nseq, mseq; // n:used, m:alloced reglist_t *seq; // regions for each sequence void *seq2regs; // hash for fast lookup from chr name to regions @@ -147,6 +148,11 @@ inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; if ( end > MAX_COOR_0 ) end = MAX_COOR_0; + if ( beg > end ) + { + uint32_t tmp = beg; beg = end; end = tmp; + } + int rid; idx->str.l = 0; kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); @@ -218,6 +224,24 @@ regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free return idx; } +int regidx_set(regidx_t *idx, regidx_opt_t key, ...) 
+{ + va_list args; + switch (key) + { + case merge_overlaps: + va_start(args, key); + idx->merge_overlaps = va_arg(args,int); + va_end(args); + return 0; + default: + hts_log_error("Todo: regidx_set key=%d",(int)key); + return -1; + break; + } + return 0; +} + regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) { if ( !parser ) @@ -250,7 +274,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr if ( payload_size ) idx->payload = malloc(payload_size); if ( !fname ) return idx; - + kstring_t str = {0,0,0}; htsFile *fp = hts_open(fname,"r"); @@ -299,53 +323,72 @@ void regidx_destroy(regidx_t *idx) free(idx); } -int _reglist_build_index(regidx_t *regidx, reglist_t *list) +static void reglist_sort_(regidx_t *regidx, reglist_t *list) { - int i; - if ( list->unsorted ) + if ( !list->unsorted ) return; + + if ( !regidx->payload_size ) + qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); + else { - if ( !regidx->payload_size ) - qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); - else + int i; + reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); + for (i=0; inreg; i++) ptr[i] = list->reg + i; + qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); + + void *tmp_dat = malloc(regidx->payload_size*list->nreg); + for (i=0; inreg; i++) { - reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); - for (i=0; inreg; i++) ptr[i] = list->reg + i; - qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); - - void *tmp_dat = malloc(regidx->payload_size*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - memcpy((char *)tmp_dat+i*regidx->payload_size, - (char *)list->dat+iori*regidx->payload_size, - regidx->payload_size); - } - free(list->dat); - list->dat = tmp_dat; - - reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - tmp_reg[i] = list->reg[iori]; - } - free(ptr); - 
free(list->reg); - list->reg = tmp_reg; - list->mreg = list->nreg; + size_t iori = ptr[i] - list->reg; + memcpy((char *)tmp_dat+i*regidx->payload_size, + (char *)list->dat+iori*regidx->payload_size, + regidx->payload_size); } - list->unsorted = 0; + free(list->dat); + list->dat = tmp_dat; + + reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + tmp_reg[i] = list->reg[iori]; + } + free(ptr); + free(list->reg); + list->reg = tmp_reg; + list->mreg = list->nreg; } + list->unsorted = 0; +} +static void reglist_merge_(regidx_t *regidx, reglist_t *list) +{ + if ( list->merged ) return; + int j; + for (j=1; jnreg; j++) + { + if ( list->reg[j-1].end < list->reg[j].beg ) continue; + if ( list->reg[j-1].end < list->reg[j].end ) list->reg[j-1].end = list->reg[j].end; + if ( j+1 < list->nreg ) memmove(&list->reg[j],&list->reg[j+1],(list->nreg-j-1)*sizeof(*list->reg)); + j--; + list->nreg--; + } + list->merged = 1; +} + +int _reglist_build_index(regidx_t *regidx, reglist_t *list) +{ + reglist_sort_(regidx,list); + if ( regidx->merge_overlaps ) reglist_merge_(regidx,list); list->nidx = 0; - int j,k, midx = 0; + int j, k, midx = 0; for (j=0; jnreg; j++) { int ibeg = iBIN(list->reg[j].beg); int iend = iBIN(list->reg[j].end); if ( midx <= iend ) { - int old_midx = midx; + int old_midx = midx; midx = iend + 1; kroundup32(midx); list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t)); @@ -436,7 +479,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && !isspace(*se) ) se++; @@ -458,7 +501,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *end = strtod(ss, &se) - 1; if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } - + return 0; } @@ -468,7 
+511,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && !isspace(*se) ) se++; @@ -515,7 +558,7 @@ int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && *se!=':' ) se++; @@ -627,6 +670,12 @@ int regitr_loop(regitr_t *regitr) itr->list = ®idx->seq[iseq]; } + if ( regidx->merge_overlaps ) + { + reglist_sort_(regidx,itr->list); + reglist_merge_(regidx,itr->list); + } + regitr->seq = itr->list->seq; regitr->beg = itr->list->reg[itr->ireg].beg; regitr->end = itr->list->reg[itr->ireg].end; diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c index 4eb96e87..23df04c0 100644 --- a/bcftools/regidx.c.pysam.c +++ b/bcftools/regidx.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" -/* - Copyright (C) 2014-2018 Genome Research Ltd. +/* + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -11,10 +11,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -67,12 +67,13 @@ struct _reglist_t void *dat; // payload data char *seq; // sequence name int unsorted; - + int merged; }; // Container of all sequences struct _regidx_t { + int merge_overlaps; int nseq, mseq; // n:used, m:alloced reglist_t *seq; // regions for each sequence void *seq2regs; // hash for fast lookup from chr name to regions @@ -149,6 +150,11 @@ inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; if ( end > MAX_COOR_0 ) end = MAX_COOR_0; + if ( beg > end ) + { + uint32_t tmp = beg; beg = end; end = tmp; + } + int rid; idx->str.l = 0; kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); @@ -220,6 +226,24 @@ regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free return idx; } +int regidx_set(regidx_t *idx, regidx_opt_t key, ...) +{ + va_list args; + switch (key) + { + case merge_overlaps: + va_start(args, key); + idx->merge_overlaps = va_arg(args,int); + va_end(args); + return 0; + default: + hts_log_error("Todo: regidx_set key=%d",(int)key); + return -1; + break; + } + return 0; +} + regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) { if ( !parser ) @@ -252,7 +276,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr if ( payload_size ) idx->payload = malloc(payload_size); if ( !fname ) return idx; - + kstring_t str = {0,0,0}; htsFile *fp = hts_open(fname,"r"); @@ -301,53 +325,72 @@ void regidx_destroy(regidx_t *idx) free(idx); } -int _reglist_build_index(regidx_t *regidx, reglist_t *list) +static void reglist_sort_(regidx_t *regidx, reglist_t *list) { - int i; - if ( list->unsorted ) + if ( !list->unsorted ) return; + + if ( !regidx->payload_size ) + qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); + else { - if ( !regidx->payload_size ) - qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); - else + int i; + reg_t **ptr = 
(reg_t**) malloc(sizeof(reg_t*)*list->nreg); + for (i=0; inreg; i++) ptr[i] = list->reg + i; + qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); + + void *tmp_dat = malloc(regidx->payload_size*list->nreg); + for (i=0; inreg; i++) { - reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); - for (i=0; inreg; i++) ptr[i] = list->reg + i; - qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); - - void *tmp_dat = malloc(regidx->payload_size*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - memcpy((char *)tmp_dat+i*regidx->payload_size, - (char *)list->dat+iori*regidx->payload_size, - regidx->payload_size); - } - free(list->dat); - list->dat = tmp_dat; - - reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - tmp_reg[i] = list->reg[iori]; - } - free(ptr); - free(list->reg); - list->reg = tmp_reg; - list->mreg = list->nreg; + size_t iori = ptr[i] - list->reg; + memcpy((char *)tmp_dat+i*regidx->payload_size, + (char *)list->dat+iori*regidx->payload_size, + regidx->payload_size); } - list->unsorted = 0; + free(list->dat); + list->dat = tmp_dat; + + reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); + for (i=0; inreg; i++) + { + size_t iori = ptr[i] - list->reg; + tmp_reg[i] = list->reg[iori]; + } + free(ptr); + free(list->reg); + list->reg = tmp_reg; + list->mreg = list->nreg; } + list->unsorted = 0; +} +static void reglist_merge_(regidx_t *regidx, reglist_t *list) +{ + if ( list->merged ) return; + int j; + for (j=1; jnreg; j++) + { + if ( list->reg[j-1].end < list->reg[j].beg ) continue; + if ( list->reg[j-1].end < list->reg[j].end ) list->reg[j-1].end = list->reg[j].end; + if ( j+1 < list->nreg ) memmove(&list->reg[j],&list->reg[j+1],(list->nreg-j-1)*sizeof(*list->reg)); + j--; + list->nreg--; + } + list->merged = 1; +} + +int _reglist_build_index(regidx_t *regidx, reglist_t *list) +{ + reglist_sort_(regidx,list); + if ( regidx->merge_overlaps ) 
reglist_merge_(regidx,list); list->nidx = 0; - int j,k, midx = 0; + int j, k, midx = 0; for (j=0; jnreg; j++) { int ibeg = iBIN(list->reg[j].beg); int iend = iBIN(list->reg[j].end); if ( midx <= iend ) { - int old_midx = midx; + int old_midx = midx; midx = iend + 1; kroundup32(midx); list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t)); @@ -438,7 +481,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && !isspace(*se) ) se++; @@ -460,7 +503,7 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *end = strtod(ss, &se) - 1; if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse bed line: %s\n", line); return -2; } - + return 0; } @@ -470,7 +513,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && !isspace(*se) ) se++; @@ -517,7 +560,7 @@ int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t while ( *ss && isspace(*ss) ) ss++; if ( !*ss ) return -1; // skip blank lines if ( *ss=='#' ) return -1; // skip comments - + char *se = ss; while ( *se && *se!=':' ) se++; @@ -629,6 +672,12 @@ int regitr_loop(regitr_t *regitr) itr->list = ®idx->seq[iseq]; } + if ( regidx->merge_overlaps ) + { + reglist_sort_(regidx,itr->list); + reglist_merge_(regidx,itr->list); + } + regitr->seq = itr->list->seq; regitr->beg = itr->list->reg[itr->ireg].beg; regitr->end = itr->list->reg[itr->ireg].end; diff --git a/bcftools/regidx.h b/bcftools/regidx.h index 09c43f89..05167bbf 100644 --- a/bcftools/regidx.h +++ b/bcftools/regidx.h @@ -1,5 +1,5 @@ -/* - Copyright (C) 2014-2016, 2018 Genome Research Ltd. 
+/* + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -39,7 +39,7 @@ while ( regitr_overlap(itr) ) { - printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end, + printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end, itr->beg+1, itr->end+1, regitr_payload(itr,char*)); } @@ -48,7 +48,7 @@ Another example, loop over all regions: - + regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); regitr_t *itr = regitr_init(idx); @@ -105,6 +105,15 @@ regitr_t; #define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload)) +typedef enum +{ + merge_overlaps, // merge overlapping regions +} +regidx_opt_t; + +int regidx_set(regidx_t *idx, regidx_opt_t key, ...); // returns 0 on success + + /* * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed * or regidx_parse_tab below. The function is expected to set `chr_from` and @@ -121,7 +130,7 @@ typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, typedef void (*regidx_free_f)(void *payload); /* - * A note about the parsers: + * A note about the parsers: * - leading spaces are ignored * - lines starting with "#" are ignored */ @@ -164,7 +173,7 @@ void regidx_destroy(regidx_t *idx); int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr); /* - * regidx_insert() - add a new region. + * regidx_insert() - add a new region. 
* regidx_insert_list() - add new regions from a list * regidx_push() - low level insertion of a new region * diff --git a/bcftools/reheader.c b/bcftools/reheader.c index 37e5d965..1d4e85a2 100644 --- a/bcftools/reheader.c +++ b/bcftools/reheader.c @@ -1,6 +1,6 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2022,2024 Genome Research Ltd. + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -418,7 +418,7 @@ static void reheader_vcf_gz(args_t *args) // Output all remaining data read with the header block if ( fp->block_length - skip_until > 0 ) { - if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); @@ -434,8 +434,8 @@ static void reheader_vcf_gz(args_t *args) int count = bgzf_raw_write(bgzf_out, buf, nread); if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } - if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? args->output_fname : "-",bgzf_out->errcode); - if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode); + if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %s\n",args->output_fname ? 
args->output_fname : "-",strerror(errno)); + if (hts_close(args->fp)) error("Error closing %s: %s\n",args->fname,strerror(errno)); free(buf); } static void reheader_vcf(args_t *args) @@ -661,12 +661,13 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n"); - fprintf(stderr, " -h, --header FILE new header\n"); - fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(stderr, " -s, --samples FILE new sample names\n"); - fprintf(stderr, " -T, --temp-prefix PATH ignored; was template for temporary file name\n"); - fprintf(stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n"); + fprintf(stderr, " -f, --fai FILE Update sequences and their lengths from the .fai file\n"); + fprintf(stderr, " -h, --header FILE New header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -s, --samples FILE New sample names\n"); + fprintf(stderr, " -T, --temp-prefix PATH Ignored; was template for temporary file name\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads (BCF only) [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "\n"); fprintf(stderr, "Example:\n"); fprintf(stderr, " # Write out the header to be modified\n"); @@ -695,12 +696,16 @@ int main_reheader(int argc, char *argv[]) {"header",1,0,'h'}, {"samples",1,0,'s'}, {"threads",1,NULL,1}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "s:h:o:f:T:v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 1 : args->n_threads = strtol(optarg, 0, 0); 
break; case 'T': break; // unused - was temp file prefix case 'f': args->fai_fname = optarg; break; diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c index 87d460a8..8e81a688 100644 --- a/bcftools/reheader.c.pysam.c +++ b/bcftools/reheader.c.pysam.c @@ -2,7 +2,7 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2022,2024 Genome Research Ltd. + Copyright (C) 2014-2025 Genome Research Ltd. Author: Petr Danecek @@ -420,7 +420,7 @@ static void reheader_vcf_gz(args_t *args) // Output all remaining data read with the header block if ( fp->block_length - skip_until > 0 ) { - if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); @@ -436,8 +436,8 @@ static void reheader_vcf_gz(args_t *args) int count = bgzf_raw_write(bgzf_out, buf, nread); if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } - if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? args->output_fname : "-",bgzf_out->errcode); - if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode); + if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %s\n",args->output_fname ? 
args->output_fname : "-",strerror(errno)); + if (hts_close(args->fp)) error("Error closing %s: %s\n",args->fname,strerror(errno)); free(buf); } static void reheader_vcf(args_t *args) @@ -663,12 +663,13 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n"); - fprintf(bcftools_stderr, " -h, --header FILE new header\n"); - fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -s, --samples FILE new sample names\n"); - fprintf(bcftools_stderr, " -T, --temp-prefix PATH ignored; was template for temporary file name\n"); - fprintf(bcftools_stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n"); + fprintf(bcftools_stderr, " -f, --fai FILE Update sequences and their lengths from the .fai file\n"); + fprintf(bcftools_stderr, " -h, --header FILE New header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -s, --samples FILE New sample names\n"); + fprintf(bcftools_stderr, " -T, --temp-prefix PATH Ignored; was template for temporary file name\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads (BCF only) [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Example:\n"); fprintf(bcftools_stderr, " # Write out the header to be modified\n"); @@ -697,12 +698,16 @@ int main_reheader(int argc, char *argv[]) {"header",1,0,'h'}, {"samples",1,0,'s'}, {"threads",1,NULL,1}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "s:h:o:f:T:v:",loptions,NULL)) >= 0) { switch 
(c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 1 : args->n_threads = strtol(optarg, 0, 0); break; case 'T': break; // unused - was temp file prefix case 'f': args->fai_fname = optarg; break; diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c index e3fbaccf..4bc4cec2 100644 --- a/bcftools/smpl_ilist.c +++ b/bcftools/smpl_ilist.c @@ -1,4 +1,4 @@ -/* +/* Copyright (C) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -162,7 +162,7 @@ smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags) { const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i); smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name); - if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) + if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) error("The sample %s is not present in the second file\n", name); } return smpl; diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c index 68ed5279..e251b5ea 100644 --- a/bcftools/smpl_ilist.c.pysam.c +++ b/bcftools/smpl_ilist.c.pysam.c @@ -1,6 +1,6 @@ #include "bcftools.pysam.h" -/* +/* Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -11,10 +11,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -164,7 +164,7 @@ smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags) { const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i); smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name); - if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) + if ( flags&SMPL_STRICT && smpl->idx[i]<0 ) error("The sample %s is not present in the second file\n", name); } return smpl; diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index b66c8cf5..238c2e98 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -774,6 +774,7 @@ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, vo } static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) { + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) @@ -855,8 +856,8 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d else { args->tmpi[ntmpi-1] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + if ( end==str || (*end && *end!=',') ) + error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); str = end+1; } } @@ -938,6 +939,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) @@ -1124,6 +1126,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) { assert( col->merge_method==MM_FIRST ); + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); int nsrc = 1, lsrc = 0; while ( args->tmps[lsrc] ) @@ -1668,8 +1671,8 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void char *end = str; ptr[ival] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + if ( end==str || (*end && *end!=',') ) + error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ival++; str = *end ? 
end+1 : end; @@ -2313,7 +2316,7 @@ static void init_columns(args_t *args) col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); col->replace = replace; - if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY; } else args->alt_idx = icol; } @@ -2321,7 +2324,6 @@ static void init_columns(args_t *args) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); if ( str.s[0]=='~' ) replace = MATCH_VALUE; - if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2330,7 +2332,11 @@ static void init_columns(args_t *args) col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); - if ( replace & MATCH_VALUE ) args->match_id = icol; + if ( replace & MATCH_VALUE ) + { + args->match_id = icol; + if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? 
BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID; + } } else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf ) { @@ -2408,8 +2414,8 @@ static void init_columns(args_t *args) col->hdr_key_src = strdup(ptr+2); col->hdr_key_dst = strdup(str.s+5); tmp.l = 0; - ksprintf(&tmp,"##INFO=",col->hdr_key_dst); - bcf_hdr_append(args->hdr_out, tmp.s); + ksprintf(&tmp,"##INFO=",col->hdr_key_dst); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); @@ -2441,7 +2447,7 @@ static void init_columns(args_t *args) if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__); tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); @@ -2475,7 +2481,7 @@ static void init_columns(args_t *args) if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue; tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); @@ -2511,7 +2517,7 @@ static void init_columns(args_t *args) if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue; tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if 
(bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); @@ -2559,7 +2565,7 @@ static void init_columns(args_t *args) if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname); tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); } @@ -2666,13 +2672,13 @@ static void init_columns(args_t *args) { // transferring ID column into a new INFO tag tmp.l = 0; - ksprintf(&tmp,"##INFO=",key_dst); + ksprintf(&tmp,"##INFO=",key_dst); } else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) { // transferring FILTER column into a new INFO tag tmp.l = 0; - ksprintf(&tmp,"##INFO=",key_dst); + ksprintf(&tmp,"##INFO=",key_dst); } else { @@ -2692,7 +2698,7 @@ static void init_columns(args_t *args) tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); } - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); @@ -3122,6 +3128,11 @@ static void init_data(args_t *args) &args->index_fn, args->write_index) < 0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } + if ( args->tgts_is_vcf ) + { + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic); + } } static void destroy_data(args_t *args) @@ -3650,7 +3661,7 @@ static void usage(args_t *args) fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE 
Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); - fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); + fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3661,6 +3672,7 @@ static void usage(args_t *args) fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); @@ -3718,13 +3730,17 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"verbosity",required_argument,NULL,'v'}, {"write-index",optional_argument,NULL,'W'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'f': args->force = 1; break; case 'k': 
args->keep_sites = 1; break; case 'm': @@ -3784,6 +3800,7 @@ int main_vcfannotate(int argc, char *argv[]) else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID; else error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : @@ -3829,7 +3846,6 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME); if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } @@ -3837,10 +3853,19 @@ int main_vcfannotate(int argc, char *argv[]) if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - static int line_errcode_warned = 0; + static int line_errcode_warned = 0, vcf_parse_error_warned = 0; init_data(args); while ( bcf_sr_next_line(args->files) ) { + if ( args->files->errnum ) + { + if ( !args->force ) error("Error: %s\n", bcf_sr_strerror(args->files->errnum)); + else if ( !vcf_parse_error_warned ) + { + fprintf(stderr,"Warning: Encountered an error, proceeding only because --force was given.\n"); + vcf_parse_error_warned = 1; + } + } if ( !bcf_sr_has_line(args->files,0) ) continue; bcf1_t *line = bcf_sr_get_line(args->files,0); if ( line->errcode ) diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 3d4d75ee..b4aeb56c 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. 
- Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -776,6 +776,7 @@ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, vo } static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) { + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) @@ -857,8 +858,8 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d else { args->tmpi[ntmpi-1] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + if ( end==str || (*end && *end!=',') ) + error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); str = end+1; } } @@ -940,6 +941,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) @@ -1126,6 +1128,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) { assert( col->merge_method==MM_FIRST ); + if ( !nals ) error("Cannot fill Number=R,A tags without --columns ..,REF,ALT,..\n"); int nsrc = 1, lsrc = 0; while ( args->tmps[lsrc] ) @@ -1670,8 +1673,8 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void char *end = str; ptr[ival] = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + if ( end==str || (*end && *end!=',') ) + error("Could not parse %s (Type=Integer) at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ival++; str = *end ? 
end+1 : end; @@ -2315,7 +2318,7 @@ static void init_columns(args_t *args) col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); col->replace = replace; - if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY; } else args->alt_idx = icol; } @@ -2323,7 +2326,6 @@ static void init_columns(args_t *args) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); if ( str.s[0]=='~' ) replace = MATCH_VALUE; - if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2332,7 +2334,11 @@ static void init_columns(args_t *args) col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); - if ( replace & MATCH_VALUE ) args->match_id = icol; + if ( replace & MATCH_VALUE ) + { + args->match_id = icol; + if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? 
BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID; + } } else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf ) { @@ -2410,8 +2416,8 @@ static void init_columns(args_t *args) col->hdr_key_src = strdup(ptr+2); col->hdr_key_dst = strdup(str.s+5); tmp.l = 0; - ksprintf(&tmp,"##INFO=",col->hdr_key_dst); - bcf_hdr_append(args->hdr_out, tmp.s); + ksprintf(&tmp,"##INFO=",col->hdr_key_dst); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst); col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); @@ -2443,7 +2449,7 @@ static void init_columns(args_t *args) if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__); tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); } if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); @@ -2477,7 +2483,7 @@ static void init_columns(args_t *args) if ( skip_info && khash_str2int_has_key(skip_info,hrec->vals[k]) ) continue; tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); @@ -2513,7 +2519,7 @@ static void init_columns(args_t *args) if ( skip_fmt && khash_str2int_has_key(skip_fmt,hrec->vals[k]) ) continue; tmp.l = 0; bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if 
(bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); @@ -2561,7 +2567,7 @@ static void init_columns(args_t *args) if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname); tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); } @@ -2668,13 +2674,13 @@ static void init_columns(args_t *args) { // transferring ID column into a new INFO tag tmp.l = 0; - ksprintf(&tmp,"##INFO=",key_dst); + ksprintf(&tmp,"##INFO=",key_dst); } else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) { // transferring FILTER column into a new INFO tag tmp.l = 0; - ksprintf(&tmp,"##INFO=",key_dst); + ksprintf(&tmp,"##INFO=",key_dst); } else { @@ -2694,7 +2700,7 @@ static void init_columns(args_t *args) tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); } - bcf_hdr_append(args->hdr_out, tmp.s); + if ( bcf_hdr_append(args->hdr_out, tmp.s) ) error("[%s:%d] failed to update the header\n",__FILE__,__LINE__); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); @@ -3124,6 +3130,11 @@ static void init_data(args_t *args) &args->index_fn, args->write_index) < 0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } + if ( args->tgts_is_vcf ) + { + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic); + } } static void destroy_data(args_t *args) @@ -3652,7 +3663,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " 
-o, --output FILE Write output to a file [standard output]\n"); fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); - fprintf(bcftools_stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); + fprintf(bcftools_stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3663,6 +3674,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); @@ -3720,13 +3732,17 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"verbosity",required_argument,NULL,'v'}, {"write-index",optional_argument,NULL,'W'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'f': args->force = 1; break; case 'k': args->keep_sites = 1; break; case 'm': @@ -3786,6 +3802,7 @@ int main_vcfannotate(int argc, char *argv[]) else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID; else error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : @@ -3831,7 +3848,6 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? 
args->pair_logic : BCF_SR_PAIR_SOME); if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } @@ -3839,10 +3855,19 @@ int main_vcfannotate(int argc, char *argv[]) if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - static int line_errcode_warned = 0; + static int line_errcode_warned = 0, vcf_parse_error_warned = 0; init_data(args); while ( bcf_sr_next_line(args->files) ) { + if ( args->files->errnum ) + { + if ( !args->force ) error("Error: %s\n", bcf_sr_strerror(args->files->errnum)); + else if ( !vcf_parse_error_warned ) + { + fprintf(bcftools_stderr,"Warning: Encountered an error, proceeding only because --force was given.\n"); + vcf_parse_error_warned = 1; + } + } if ( !bcf_sr_has_line(args->files,0) ) continue; bcf1_t *line = bcf_sr_get_line(args->files,0); if ( line->errcode ) diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c index 22390d0f..4dee727e 100644 --- a/bcftools/vcfbuf.c +++ b/bcftools/vcfbuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2024 Genome Research Ltd. + Copyright (c) 2016-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -49,9 +49,9 @@ typedef struct } vcfrec_t; -#define PRUNE_MODE_MAX_AF 1 -#define PRUNE_MODE_1ST 2 -#define PRUNE_MODE_RAND 3 +#define PRUNE_MODE_MAX_AF 1 +#define PRUNE_MODE_1ST 2 +#define PRUNE_MODE_RAND 3 typedef struct { int max_sites, mvrec, mac, mfarr, mode; @@ -62,6 +62,18 @@ typedef struct } prune_t; +#define CLUSTER_MODE_PRUNE 1 // remove cluster +#define CLUSTER_MODE_SIZE 2 // make cluster size available via vcfbuf_get_val(buf,int,CLUSTER_SIZE); +typedef struct +{ + int max_sites; // used with CLUSTER_PRUNE, removes cluster with more than this many sites within the window + int mode; // one of CLUSTER_MODE_PRUNE or CLUSTER_MODE_SIZE + int last; // the value of the currently removed element + int *size; // cluster size for this site + rbuf_t rbuf; +} +cluster_t; + #define MARK_OVERLAP 1 #define MARK_DUP 2 @@ -108,7 +120,7 @@ mark_t; struct _vcfbuf_t { - int win, // maximum number of sites in the buffer, either number of sites (<0) or bp (<0) + int win, // maximum number of sites in the buffer, either number of sites (>0) or bp (<0) dummy; // the caller maintains the buffer via push/peek/flush bcf_hdr_t *hdr; vcfrec_t *vcf; @@ -116,6 +128,7 @@ struct _vcfbuf_t ld_t ld; prune_t prune; mark_t mark; + cluster_t cluster; enum { clean, dirty } status; }; @@ -129,6 +142,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) int i; for (i=0; ild.max[i] = HUGE_VAL; rbuf_init(&buf->rbuf, 0); + rbuf_init(&buf->mark.rbuf, 0); + rbuf_init(&buf->cluster.rbuf, 0); return buf; } @@ -149,6 +164,7 @@ void vcfbuf_destroy(vcfbuf_t *buf) free(buf->mark.buf); free(buf->mark.buf_ptr); free(buf->mark.tmpi); + free(buf->cluster.size); free(buf); } @@ -193,6 +209,20 @@ int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...) 
va_end(args); return 0; + case CLUSTER_PRUNE: + va_start(args, key); + buf->cluster.max_sites = va_arg(args,int); + buf->cluster.mode = CLUSTER_MODE_PRUNE; + va_end(args); + return 0; + + case CLUSTER_SIZE: + va_start(args, key); + buf->cluster.max_sites = va_arg(args,int); + buf->cluster.mode = CLUSTER_MODE_SIZE; + va_end(args); + return 0; + case PRUNE_NSITES: va_start(args, key); buf->prune.max_sites = va_arg(args,int); @@ -252,6 +282,8 @@ void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...) va_start(args, key); if ( key==MARK ) return &buf->mark.last; + if ( key==CLUSTER_SIZE ) + return &buf->cluster.last; va_end(args); return NULL; } @@ -638,6 +670,106 @@ static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all) return 1; } +int cluster_can_flush_(vcfbuf_t *buf, int flush_all) +{ + cluster_t *cluster = &buf->cluster; + +//{ int i; i=-1; while ( rbuf_next(&buf->rbuf,&i) ) fprintf(stderr," %d",(int)buf->vcf[i].rec->pos+1); fprintf(stderr," .. dirty=%d flush_all=%d\n",buf->status,flush_all); } + if ( buf->status==dirty ) + { + // a new site was just added by vcfbuf_push() + rbuf_expand0(&cluster->rbuf, int, buf->rbuf.n, cluster->size); + int i = rbuf_append(&cluster->rbuf); + cluster->size[i] = 0; + } + assert( cluster->rbuf.n==buf->rbuf.n ); + + // The following cases can occur: + // - if flush_all is set, then the entire buffer must be within the window + // - else if the last record can be on a different chr, then everything before can be flushed + // - else the last record can be either within or outside the window with respect to the first record + + + if ( buf->status==dirty ) + { + int ib = 0; + while ( ib < buf->rbuf.n ) + { + int b = rbuf_kth(&buf->rbuf, ib); + int ie = ib + 1; + while ( ie < buf->rbuf.n ) + { + int e = rbuf_kth(&buf->rbuf, ie); + if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) break; + if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) break; // win is negative + ie++; + } + // now ie is just outside the window 
or beyond the last element of the window + + // count the number of unfiltered sites that contribute to the cluster. Note this is inefficient, + // recalculating the same bits over and over, should be improved.. + int ix, nbuf = 0; + for (ix=ib; ixrbuf, ix); + if ( buf->vcf[x].filter ) continue; + nbuf++; + } + for (ix=ib; ixrbuf, ix); + if ( cluster->size[x] < nbuf ) cluster->size[x] = nbuf; + } + ib++; + } + buf->status = clean; + } + + int b = rbuf_kth(&buf->rbuf, 0); // first + int e = rbuf_last(&buf->rbuf); // last + int can_flush = flush_all; + if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) can_flush = 1; + if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) can_flush = 1; + if ( !can_flush ) return 0; + + if ( buf->cluster.mode==CLUSTER_MODE_PRUNE ) + { + int flush = 0; + while ( buf->rbuf.n ) + { + flush = 0; + int b = rbuf_kth(&buf->rbuf, 0); + int e = rbuf_kth(&buf->rbuf, -1); + if ( buf->vcf[b].filter ) + { + // not to be pruned, not counted as part of the cluster + flush = 1; + break; + } + + if ( flush_all ) flush = 1; + else if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) flush = 1; + else if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) flush = 1; + if ( !flush ) break; + + b = rbuf_kth(&cluster->rbuf, 0); + if ( cluster->size[b] <= cluster->max_sites ) break; // not to be pruned + + rbuf_remove_kth(&buf->rbuf, vcfrec_t, 0, buf->vcf); + rbuf_remove_kth(&cluster->rbuf, int, 0, cluster->size); + } + if ( !flush ) return 0; + } + + if ( !cluster->rbuf.n ) return 0; + + b = rbuf_shift(&cluster->rbuf); + cluster->last = cluster->size[b]; + b = rbuf_kth(&buf->rbuf, 0); + if ( buf->vcf[b].filter ) cluster->last = 0; + return 1; +} + bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) { int i,j; @@ -648,6 +780,13 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) // dummy mode, always flushing if ( buf->dummy ) goto ret; + // either annotate or print clustered sites + if ( buf->cluster.mode ) + { + if ( 
!cluster_can_flush_(buf,flush_all) ) return NULL; + goto ret; + } + // pruning mode if ( buf->win ) { diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c index b74a5c49..ee7c9a70 100644 --- a/bcftools/vcfbuf.c.pysam.c +++ b/bcftools/vcfbuf.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2016-2024 Genome Research Ltd. + Copyright (c) 2016-2025 Genome Research Ltd. Author: Petr Danecek @@ -51,9 +51,9 @@ typedef struct } vcfrec_t; -#define PRUNE_MODE_MAX_AF 1 -#define PRUNE_MODE_1ST 2 -#define PRUNE_MODE_RAND 3 +#define PRUNE_MODE_MAX_AF 1 +#define PRUNE_MODE_1ST 2 +#define PRUNE_MODE_RAND 3 typedef struct { int max_sites, mvrec, mac, mfarr, mode; @@ -64,6 +64,18 @@ typedef struct } prune_t; +#define CLUSTER_MODE_PRUNE 1 // remove cluster +#define CLUSTER_MODE_SIZE 2 // make cluster size available via vcfbuf_get_val(buf,int,CLUSTER_SIZE); +typedef struct +{ + int max_sites; // used with CLUSTER_PRUNE, removes cluster with more than this many sites within the window + int mode; // one of CLUSTER_MODE_PRUNE or CLUSTER_MODE_SIZE + int last; // the value of the currently removed element + int *size; // cluster size for this site + rbuf_t rbuf; +} +cluster_t; + #define MARK_OVERLAP 1 #define MARK_DUP 2 @@ -110,7 +122,7 @@ mark_t; struct _vcfbuf_t { - int win, // maximum number of sites in the buffer, either number of sites (<0) or bp (<0) + int win, // maximum number of sites in the buffer, either number of sites (>0) or bp (<0) dummy; // the caller maintains the buffer via push/peek/flush bcf_hdr_t *hdr; vcfrec_t *vcf; @@ -118,6 +130,7 @@ struct _vcfbuf_t ld_t ld; prune_t prune; mark_t mark; + cluster_t cluster; enum { clean, dirty } status; }; @@ -131,6 +144,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) int i; for (i=0; ild.max[i] = HUGE_VAL; rbuf_init(&buf->rbuf, 0); + rbuf_init(&buf->mark.rbuf, 0); + rbuf_init(&buf->cluster.rbuf, 0); return buf; } @@ -151,6 +166,7 @@ void vcfbuf_destroy(vcfbuf_t *buf) free(buf->mark.buf); 
free(buf->mark.buf_ptr); free(buf->mark.tmpi); + free(buf->cluster.size); free(buf); } @@ -195,6 +211,20 @@ int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...) va_end(args); return 0; + case CLUSTER_PRUNE: + va_start(args, key); + buf->cluster.max_sites = va_arg(args,int); + buf->cluster.mode = CLUSTER_MODE_PRUNE; + va_end(args); + return 0; + + case CLUSTER_SIZE: + va_start(args, key); + buf->cluster.max_sites = va_arg(args,int); + buf->cluster.mode = CLUSTER_MODE_SIZE; + va_end(args); + return 0; + case PRUNE_NSITES: va_start(args, key); buf->prune.max_sites = va_arg(args,int); @@ -254,6 +284,8 @@ void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...) va_start(args, key); if ( key==MARK ) return &buf->mark.last; + if ( key==CLUSTER_SIZE ) + return &buf->cluster.last; va_end(args); return NULL; } @@ -640,6 +672,106 @@ static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all) return 1; } +int cluster_can_flush_(vcfbuf_t *buf, int flush_all) +{ + cluster_t *cluster = &buf->cluster; + +//{ int i; i=-1; while ( rbuf_next(&buf->rbuf,&i) ) fprintf(bcftools_stderr," %d",(int)buf->vcf[i].rec->pos+1); fprintf(bcftools_stderr," .. 
dirty=%d flush_all=%d\n",buf->status,flush_all); } + if ( buf->status==dirty ) + { + // a new site was just added by vcfbuf_push() + rbuf_expand0(&cluster->rbuf, int, buf->rbuf.n, cluster->size); + int i = rbuf_append(&cluster->rbuf); + cluster->size[i] = 0; + } + assert( cluster->rbuf.n==buf->rbuf.n ); + + // The following cases can occur: + // - if flush_all is set, then the entire buffer must be within the window + // - else if the last record can be on a different chr, then everything before can be flushed + // - else the last record can be either within or outside the window with respect to the first record + + + if ( buf->status==dirty ) + { + int ib = 0; + while ( ib < buf->rbuf.n ) + { + int b = rbuf_kth(&buf->rbuf, ib); + int ie = ib + 1; + while ( ie < buf->rbuf.n ) + { + int e = rbuf_kth(&buf->rbuf, ie); + if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) break; + if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) break; // win is negative + ie++; + } + // now ie is just outside the window or beyond the last element of the window + + // count the number of unfiltered sites that contribute to the cluster. Note this is inefficient, + // recalculating the same bits over and over, should be improved.. 
+ int ix, nbuf = 0; + for (ix=ib; ixrbuf, ix); + if ( buf->vcf[x].filter ) continue; + nbuf++; + } + for (ix=ib; ixrbuf, ix); + if ( cluster->size[x] < nbuf ) cluster->size[x] = nbuf; + } + ib++; + } + buf->status = clean; + } + + int b = rbuf_kth(&buf->rbuf, 0); // first + int e = rbuf_last(&buf->rbuf); // last + int can_flush = flush_all; + if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) can_flush = 1; + if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) can_flush = 1; + if ( !can_flush ) return 0; + + if ( buf->cluster.mode==CLUSTER_MODE_PRUNE ) + { + int flush = 0; + while ( buf->rbuf.n ) + { + flush = 0; + int b = rbuf_kth(&buf->rbuf, 0); + int e = rbuf_kth(&buf->rbuf, -1); + if ( buf->vcf[b].filter ) + { + // not to be pruned, not counted as part of the cluster + flush = 1; + break; + } + + if ( flush_all ) flush = 1; + else if ( buf->vcf[b].rec->rid != buf->vcf[e].rec->rid ) flush = 1; + else if ( buf->vcf[e].rec->pos - buf->vcf[b].rec->pos + 1 > -buf->win ) flush = 1; + if ( !flush ) break; + + b = rbuf_kth(&cluster->rbuf, 0); + if ( cluster->size[b] <= cluster->max_sites ) break; // not to be pruned + + rbuf_remove_kth(&buf->rbuf, vcfrec_t, 0, buf->vcf); + rbuf_remove_kth(&cluster->rbuf, int, 0, cluster->size); + } + if ( !flush ) return 0; + } + + if ( !cluster->rbuf.n ) return 0; + + b = rbuf_shift(&cluster->rbuf); + cluster->last = cluster->size[b]; + b = rbuf_kth(&buf->rbuf, 0); + if ( buf->vcf[b].filter ) cluster->last = 0; + return 1; +} + bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) { int i,j; @@ -650,6 +782,13 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) // dummy mode, always flushing if ( buf->dummy ) goto ret; + // either annotate or print clustered sites + if ( buf->cluster.mode ) + { + if ( !cluster_can_flush_(buf,flush_all) ) return NULL; + goto ret; + } + // pruning mode if ( buf->win ) { diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h index 96d7115c..15054ff3 100644 --- a/bcftools/vcfbuf.h +++ 
b/bcftools/vcfbuf.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2017-2024 Genome Research Ltd. + Copyright (c) 2017-2025 Genome Research Ltd. Author: Petr Danecek @@ -45,6 +45,11 @@ typedef enum PRUNE_NSITES_MODE, // char *, maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) PRUNE_AF_TAG, // char *, use this INFO/AF tag with VCFBUF_NSITES + CLUSTER_PRUNE, // int, remove clusters of more than this many sites within the window + CLUSTER_SIZE, // w: int, if set, vcfbuf_get_val(buf,int,CLUSTER_SIZE) will be returning the cluster size + // r: use as in the example for MARK below. Returns positive values for valid sites, + // 0 for filtered sites + // duplicates and overlaps MARK, // w: char *, resolve overlaps by preferentially removing sites according to EXPR: // min(QUAL) .. remove sites with lowest QUAL until overlaps are resolved @@ -134,19 +139,20 @@ int vcfbuf_nsites(vcfbuf_t *buf); * Returns 0 on success or -1 if no values were filled. * * @val: will be filled with the values - * .. correlation coefficient r-squared - * .. Lewontin's D' (PMID: 19433632) - * .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265) + * r2 .. correlation coefficient r-squared + * LD .. Lewontin's D' (doi:10.1534/genetics.108.093153) + * RD,HD .. 
Ragsdale's \hat{D} (doi:10.1093/molbev/msz265) * @rec: corresponding positions or NULL if the value(s) has not been set */ #define VCFBUF_LD_N 3 #define VCFBUF_LD_IDX_R2 0 #define VCFBUF_LD_IDX_LD 1 #define VCFBUF_LD_IDX_HD 2 +#define VCFBUF_LD_IDX_RD 2 typedef struct { - double val[VCFBUF_LD_N]; // r2, ld, hd - bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, hd + double val[VCFBUF_LD_N]; // r2, ld, rd + bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, rd } vcfbuf_ld_t; int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld); diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index 13e516f8..7ea14366 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -112,18 +112,15 @@ typedef struct } args_t; -static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith) +static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char *sex, int *ith) { int ret = khash_str2int_get(name2idx, name, ith); if ( ret==0 ) return lines; hts_expand(char*,(*nlines+1),*mlines,lines); - int len = strlen(name); - lines[*nlines] = (char*) malloc(len+3); - memcpy(lines[*nlines],name,len); - lines[*nlines][len] = ' '; - lines[*nlines][len+1] = sex; - lines[*nlines][len+2] = 0; + kstring_t str = {0,0,0}; + ksprintf(&str,"%s %s",name,sex); + lines[*nlines] = str.s; *ith = *nlines; (*nlines)++; khash_str2int_set(name2idx, strdup(name), *ith); @@ -205,12 +202,14 @@ static ploidy_predef_t ploidy_predefs[] = // only 5 columns are required and the first is ignored: // ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F) -static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl) +static char **parse_ped_samples(args_t *args, call_t *call, char **vals, int nvals, int *nsmpl) { int i, j, 
mlines = 0, nlines = 0; kstring_t str = {0,0,0}, fam_str = {0,0,0}; void *name2idx = khash_str2int_init(); char **lines = NULL; + + char *msex = "M", *fsex = "F"; for (i=0; iploidy,sex)<0 ) + { + // this gender is not defined, if 1/2, test if M/F is + if ( !strcmp(sex,"1") && ploidy_sex2id(args->ploidy,msex)>=0 ) sex = msex; + else if ( !strcmp(sex,"2") && ploidy_sex2id(args->ploidy,fsex)>=0 ) sex = fsex; + else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex); + } lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j); if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother { @@ -248,9 +250,9 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl fam->name = strdup(fam_str.s); if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) ) - lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]); + lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, msex, &fam->sample[FATHER]); if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) ) - lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]); + lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, fsex, &fam->sample[MOTHER]); khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]); khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]); @@ -276,12 +278,17 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl */ static void set_samples(args_t *args, const char *fn, int is_file) { - int i, nlines; + int i, nlines, negate = 0; + if ( fn[0]=='^' ) + { + negate = 1; + fn++; + } char **lines = hts_readlist(fn, is_file, &nlines); if ( !lines ) error("Could not read the file: %s\n", fn); int nsmpls; - char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls); + char **smpls = parse_ped_samples(args, &args->aux, lines, nlines, &nsmpls); if ( 
smpls ) { for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id; int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); - for (i=0; iaux.hdr); i++) old2new[i] = -1; - int nsmpl = 0, map_needed = 0; - for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); - if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } - if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } - - ss = se+(x != '\0'); - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) ss = "2"; // default ploidy - se = ss; - while ( *se && !isspace(*se) ) se++; - if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); } - - if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') ) - args->sample2sex[nsmpl] = -1*(ss[0]-'0'); - else - args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss); + for (i=0; iaux.hdr); i++) old2new[i] = -1; + for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); + if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } + + ss = se+(x != '\0'); + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) ss = "2"; // default ploidy + se = ss; + while ( *se && !isspace(*se) ) se++; + if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); } + + char *sex = ss; + if ( ploidy_sex2id(args->ploidy,sex)<0 ) + { + if ( sex[1]==0 && (sex[0]=='0' || sex[0]=='1' || sex[0]=='2') ) args->sample2sex[nsmpl] = -1*(sex[0]-'0'); + else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex); + } + else + args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy,sex); - if ( ismpl!=nsmpl ) map_needed = 1; - args->samples_map[nsmpl] = ismpl; - old2new[ismpl] = nsmpl; - nsmpl++; + if ( ismpl!=nsmpl ) map_needed = 1; + args->samples_map[nsmpl] = ismpl; + old2new[ismpl] = nsmpl; + nsmpl++; + } + 
if ( nsmpl!=bcf_hdr_nsamples(args->aux.hdr) ) map_needed = 1; + } + else + { + // negate: in this mode the default ploidy must be used for obvious reason - there is no way to + // specify ploidy if the sample name is not shown + for (i=0; iaux.hdr); i++) old2new[i] = 1; // by default keep the sample + for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); + if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + + old2new[ismpl] = 0; // do not keep this sample + free(lines[i]); + } + free(lines); + lines = malloc(sizeof(*lines)*bcf_hdr_nsamples(args->aux.hdr)); + nsmpl = 0; + for (i=0; iaux.hdr); i++) + { + if ( !old2new[i] ) continue; + lines[nsmpl] = strdup(args->aux.hdr->samples[i]); + args->samples_map[nsmpl] = i; + old2new[i] = nsmpl; + nsmpl++; + } + map_needed = 1; } for (i=0; iaux.nfams; i++) @@ -927,6 +975,7 @@ static void usage(args_t *args) fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(stderr, " --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); @@ -1012,6 +1061,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1103,6 +1153,9 @@ int main_vcfcall(int argc, char *argv[]) if (!(args.write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); break; + case 10: + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; default: usage(&args); } } diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 0da344ef..3dab20d7 
100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -2,7 +2,7 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -114,18 +114,15 @@ typedef struct } args_t; -static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith) +static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char *sex, int *ith) { int ret = khash_str2int_get(name2idx, name, ith); if ( ret==0 ) return lines; hts_expand(char*,(*nlines+1),*mlines,lines); - int len = strlen(name); - lines[*nlines] = (char*) malloc(len+3); - memcpy(lines[*nlines],name,len); - lines[*nlines][len] = ' '; - lines[*nlines][len+1] = sex; - lines[*nlines][len+2] = 0; + kstring_t str = {0,0,0}; + ksprintf(&str,"%s %s",name,sex); + lines[*nlines] = str.s; *ith = *nlines; (*nlines)++; khash_str2int_set(name2idx, strdup(name), *ith); @@ -207,12 +204,14 @@ static ploidy_predef_t ploidy_predefs[] = // only 5 columns are required and the first is ignored: // ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F) -static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl) +static char **parse_ped_samples(args_t *args, call_t *call, char **vals, int nvals, int *nsmpl) { int i, j, mlines = 0, nlines = 0; kstring_t str = {0,0,0}, fam_str = {0,0,0}; void *name2idx = khash_str2int_init(); char **lines = NULL; + + char *msex = "M", *fsex = "F"; for (i=0; iploidy,sex)<0 ) + { + // this gender is not defined, if 1/2, test if M/F is + if ( !strcmp(sex,"1") && ploidy_sex2id(args->ploidy,msex)>=0 ) sex = msex; + else if ( !strcmp(sex,"2") && ploidy_sex2id(args->ploidy,fsex)>=0 ) sex = fsex; + else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex); + } lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j); if ( 
strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother { @@ -250,9 +252,9 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl fam->name = strdup(fam_str.s); if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) ) - lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]); + lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, msex, &fam->sample[FATHER]); if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) ) - lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]); + lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, fsex, &fam->sample[MOTHER]); khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]); khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]); @@ -278,12 +280,17 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl */ static void set_samples(args_t *args, const char *fn, int is_file) { - int i, nlines; + int i, nlines, negate = 0; + if ( fn[0]=='^' ) + { + negate = 1; + fn++; + } char **lines = hts_readlist(fn, is_file, &nlines); if ( !lines ) error("Could not read the file: %s\n", fn); int nsmpls; - char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls); + char **smpls = parse_ped_samples(args, &args->aux, lines, nlines, &nsmpls); if ( smpls ) { for (i=0; iaux.hdr); i++) args->sample2sex[i] = dflt_sex_id; int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); - for (i=0; iaux.hdr); i++) old2new[i] = -1; - int nsmpl = 0, map_needed = 0; - for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); - if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } - if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } - - ss = se+(x != '\0'); - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) ss = "2"; // default ploidy 
- se = ss; - while ( *se && !isspace(*se) ) se++; - if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); } - - if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') ) - args->sample2sex[nsmpl] = -1*(ss[0]-'0'); - else - args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss); + for (i=0; iaux.hdr); i++) old2new[i] = -1; + for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); + if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } + + ss = se+(x != '\0'); + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) ss = "2"; // default ploidy + se = ss; + while ( *se && !isspace(*se) ) se++; + if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); } + + char *sex = ss; + if ( ploidy_sex2id(args->ploidy,sex)<0 ) + { + if ( sex[1]==0 && (sex[0]=='0' || sex[0]=='1' || sex[0]=='2') ) args->sample2sex[nsmpl] = -1*(sex[0]-'0'); + else error("[E::%s] The sex \"%s\" has not been declared in --ploidy/--ploidy-file\n",__func__,sex); + } + else + args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy,sex); - if ( ismpl!=nsmpl ) map_needed = 1; - args->samples_map[nsmpl] = ismpl; - old2new[ismpl] = nsmpl; - nsmpl++; + if ( ismpl!=nsmpl ) map_needed = 1; + args->samples_map[nsmpl] = ismpl; + old2new[ismpl] = nsmpl; + nsmpl++; + } + if ( nsmpl!=bcf_hdr_nsamples(args->aux.hdr) ) map_needed = 1; + } + else + { + // negate: in this mode the default ploidy must be used for obvious reason - there is no way to + // specify ploidy if the sample name is not shown + for (i=0; iaux.hdr); i++) old2new[i] = 1; // by default keep the sample + for (i=0; iaux.hdr, BCF_DT_SAMPLE, ss); + if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + + old2new[ismpl] = 0; // do not keep this sample + free(lines[i]); + } + free(lines); + lines = 
malloc(sizeof(*lines)*bcf_hdr_nsamples(args->aux.hdr)); + nsmpl = 0; + for (i=0; iaux.hdr); i++) + { + if ( !old2new[i] ) continue; + lines[nsmpl] = strdup(args->aux.hdr->samples[i]); + args->samples_map[nsmpl] = i; + old2new[i] = nsmpl; + nsmpl++; + } + map_needed = 1; } for (i=0; iaux.nfams; i++) @@ -929,6 +977,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(bcftools_stderr, " --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Consensus/variant calling options:\n"); @@ -1014,6 +1063,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1105,6 +1155,9 @@ int main_vcfcall(int argc, char *argv[]) if (!(args.write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); break; + case 10: + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; default: usage(&args); } } diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index e970b043..2e44a8e5 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2014-2022 Genome Research Ltd. + Copyright (c) 2014-2025 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -97,7 +97,7 @@ typedef struct _args_t uint32_t *sites; // positions [nsites,msites] int nsites, msites; - double baum_welch_th, optimize_frac; + double baum_welch_th, optimize_frac; float plot_th; FILE *summary_fh; char **argv, *regions_list, *summary_fname, *output_dir; @@ -144,7 +144,7 @@ static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob) { // interpret ij_prob differently, as ii_prob in fact, so that for two // samples the behaviour is somewhat closer to single sample calling - // with s=0. + // with s=0. double pii = 1 - ij_prob*(N_STATES-1); ij_prob = (1 - pii) / (ndim - 1); for (j=0; jhdr)>1 ) error("Multi-sample VCF, missing the -s option\n"); args->query_sample.name = strdup(args->hdr->samples[0]); } - else + else if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name); if ( !args->files->readers[0].file->is_bin ) { @@ -283,7 +283,7 @@ static void init_data(args_t *args) } else args->summary_fh = NULL; // one sample only, no two-file summary - + int i; FILE *fh = args->summary_fh ? 
args->summary_fh : args->query_sample.summary_fh; @@ -391,7 +391,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n" " plt.savefig('%s/plot.%s.chr'+chr+'.png')\n" " plt.close()\n" - "\n", + "\n", smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name ); fclose(fp); @@ -557,7 +557,7 @@ static void create_plots(args_t *args) " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n" " plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n" " plt.close()\n" - "\n", + "\n", args->control_sample.name,args->query_sample.name, args->output_dir, args->control_sample.dat_fname,args->query_sample.dat_fname, @@ -643,17 +643,17 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite) return 0; } - double cn1_baf = + double cn1_baf = norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) + norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ; - double cn2_baf = - norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR + - norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA + + double cn2_baf = + norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR + + norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA + norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA; - double cn3_baf = - norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR + - norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 + - norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 + + double cn3_baf = + norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR + + norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 + + norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 + norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA; double norm = cn1_baf + cn2_baf + cn3_baf; @@ -1134,7 +1134,7 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, static void cnv_next_line(args_t *args, bcf1_t *line) { - if ( !line ) + if ( !line ) { // Done, flush viterbi cnv_flush_viterbi(args); @@ -1154,7 +1154,7 @@ static void cnv_next_line(args_t *args, bcf1_t *line) args->ntot++; bcf_fmt_t 
*baf_fmt, *lrr_fmt = NULL; - if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return; + if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return; if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return; float baf1,lrr1,baf2,lrr2; @@ -1226,6 +1226,7 @@ static void usage(args_t *args) fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "HMM Options:\n"); fprintf(stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n"); fprintf(stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n"); @@ -1271,7 +1272,7 @@ int main_vcfcnv(int argc, char *argv[]) int regions_overlap = 1; int targets_overlap = 0; - static struct option loptions[] = + static struct option loptions[] = { {"BAF-dev",1,0,'d'}, {"LRR-dev",1,0,'k'}, @@ -1295,17 +1296,21 @@ int main_vcfcnv(int argc, char *argv[]) {"regions-overlap",required_argument,NULL,3}, {"plot-threshold",1,0,'p'}, {"output-dir",1,0,'o'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp = NULL; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:v:",loptions,NULL)) >= 0) { switch (c) { - case 'L': + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; + case 'L': args->lrr_smooth_win = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg); break; case 'f': args->af_fname = optarg; break; - case 'O': + case 'O': args->optimize_frac = 
strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -O %s\n", optarg); break; @@ -1348,27 +1353,27 @@ int main_vcfcnv(int argc, char *argv[]) args->baum_welch_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -W %s\n", optarg); break; - case 'e': + case 'e': args->err_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -e %s\n", optarg); break; - case 'b': + case 'b': args->baf_bias = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -b %s\n", optarg); break; - case 'x': + case 'x': args->ij_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -x %s\n", optarg); break; - case 'P': + case 'P': args->same_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -P %s\n", optarg); break; - case 'l': + case 'l': args->lrr_bias = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -l %s\n", optarg); break; - case 'p': + case 'p': args->plot_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -p %s\n", optarg); break; @@ -1387,7 +1392,7 @@ int main_vcfcnv(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; - case 'h': + case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -1421,7 +1426,7 @@ int main_vcfcnv(int argc, char *argv[]) } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - + init_data(args); while ( bcf_sr_next_line(args->files) ) { diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index d6ee4212..031ba53a 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -2,20 +2,20 @@ /* The MIT License - Copyright (c) 2014-2022 Genome Research Ltd. + Copyright (c) 2014-2025 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -99,7 +99,7 @@ typedef struct _args_t uint32_t *sites; // positions [nsites,msites] int nsites, msites; - double baum_welch_th, optimize_frac; + double baum_welch_th, optimize_frac; float plot_th; FILE *summary_fh; char **argv, *regions_list, *summary_fname, *output_dir; @@ -146,7 +146,7 @@ static double *init_tprob_matrix(int ndim, double ij_prob, double same_prob) { // interpret ij_prob differently, as ii_prob in fact, so that for two // samples the behaviour is somewhat closer to single sample calling - // with s=0. + // with s=0. double pii = 1 - ij_prob*(N_STATES-1); ij_prob = (1 - pii) / (ndim - 1); for (j=0; jhdr)>1 ) error("Multi-sample VCF, missing the -s option\n"); args->query_sample.name = strdup(args->hdr->samples[0]); } - else + else if ( bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->query_sample.name)<0 ) error("The sample \"%s\" not found\n", args->query_sample.name); if ( !args->files->readers[0].file->is_bin ) { @@ -285,7 +285,7 @@ static void init_data(args_t *args) } else args->summary_fh = NULL; // one sample only, no two-file summary - + int i; FILE *fh = args->summary_fh ? 
args->summary_fh : args->query_sample.summary_fh; @@ -393,7 +393,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92)\n" " plt.savefig('%s/plot.%s.chr'+chr+'.png')\n" " plt.close()\n" - "\n", + "\n", smpl->dat_fname,smpl->cn_fname,smpl->name,args->output_dir,smpl->name ); fclose(fp); @@ -559,7 +559,7 @@ static void create_plots(args_t *args) " plt.subplots_adjust(left=0.08,right=0.95,bottom=0.08,top=0.92,hspace=0)\n" " plt.savefig('%s/plot.%s.%s.chr'+chr+'.png')\n" " plt.close()\n" - "\n", + "\n", args->control_sample.name,args->query_sample.name, args->output_dir, args->control_sample.dat_fname,args->query_sample.dat_fname, @@ -645,17 +645,17 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite) return 0; } - double cn1_baf = + double cn1_baf = norm_prob(baf,GAUSS_CN1_PK_R(smpl)) * (fRR + fRA*0.5) + norm_prob(baf,GAUSS_CN1_PK_A(smpl)) * (fAA + fRA*0.5) ; - double cn2_baf = - norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR + - norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA + + double cn2_baf = + norm_prob(baf,GAUSS_CN2_PK_RR(smpl)) * fRR + + norm_prob(baf,GAUSS_CN2_PK_RA(smpl)) * fRA + norm_prob(baf,GAUSS_CN2_PK_AA(smpl)) * fAA; - double cn3_baf = - norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR + - norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 + - norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 + + double cn3_baf = + norm_prob(baf,GAUSS_CN3_PK_RRR(smpl)) * fRR + + norm_prob(baf,GAUSS_CN3_PK_RRA(smpl)) * fRA*0.5 + + norm_prob(baf,GAUSS_CN3_PK_RAA(smpl)) * fRA*0.5 + norm_prob(baf,GAUSS_CN3_PK_AAA(smpl)) * fAA; double norm = cn1_baf + cn2_baf + cn3_baf; @@ -1136,7 +1136,7 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, static void cnv_next_line(args_t *args, bcf1_t *line) { - if ( !line ) + if ( !line ) { // Done, flush viterbi cnv_flush_viterbi(args); @@ -1156,7 +1156,7 @@ static void cnv_next_line(args_t *args, bcf1_t *line) args->ntot++; bcf_fmt_t 
*baf_fmt, *lrr_fmt = NULL; - if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return; + if ( !(baf_fmt = bcf_get_fmt(args->hdr, line, "BAF")) ) return; if ( args->lrr_bias>0 && !(lrr_fmt = bcf_get_fmt(args->hdr, line, "LRR")) ) return; float baf1,lrr1,baf2,lrr2; @@ -1228,6 +1228,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "HMM Options:\n"); fprintf(bcftools_stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n"); fprintf(bcftools_stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n"); @@ -1273,7 +1274,7 @@ int main_vcfcnv(int argc, char *argv[]) int regions_overlap = 1; int targets_overlap = 0; - static struct option loptions[] = + static struct option loptions[] = { {"BAF-dev",1,0,'d'}, {"LRR-dev",1,0,'k'}, @@ -1297,17 +1298,21 @@ int main_vcfcnv(int argc, char *argv[]) {"regions-overlap",required_argument,NULL,3}, {"plot-threshold",1,0,'p'}, {"output-dir",1,0,'o'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp = NULL; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:v:",loptions,NULL)) >= 0) { switch (c) { - case 'L': + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; + case 'L': args->lrr_smooth_win = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse: --LRR-smooth-win %s\n", optarg); break; case 'f': args->af_fname = optarg; break; 
- case 'O': + case 'O': args->optimize_frac = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -O %s\n", optarg); break; @@ -1350,27 +1355,27 @@ int main_vcfcnv(int argc, char *argv[]) args->baum_welch_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -W %s\n", optarg); break; - case 'e': + case 'e': args->err_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -e %s\n", optarg); break; - case 'b': + case 'b': args->baf_bias = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -b %s\n", optarg); break; - case 'x': + case 'x': args->ij_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -x %s\n", optarg); break; - case 'P': + case 'P': args->same_prob = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -P %s\n", optarg); break; - case 'l': + case 'l': args->lrr_bias = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -l %s\n", optarg); break; - case 'p': + case 'p': args->plot_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -p %s\n", optarg); break; @@ -1389,7 +1394,7 @@ int main_vcfcnv(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; - case 'h': + case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -1423,7 +1428,7 @@ int main_vcfcnv(int argc, char *argv[]) } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - + init_data(args); while ( bcf_sr_next_line(args->files) ) { diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index 232b3ae3..48afd20f 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -641,7 +641,7 @@ static void concat(args_t *args) bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); bcf_hdr_destroy(hdr_ori); } - if ( !fp->is_bin && args->output_type&FT_VCF ) + if ( !fp->is_bin && args->output_type&FT_VCF && !args->out_fh->idx) { line->max_unpack = BCF_UN_STR; // if VCF is on both input and output, avoid VCF to BCF conversion @@ -662,6 +662,7 @@ static void concat(args_t *args) } str++; } + fp->line.l = str - fp->line.s; str = fp->line.s; } while ( *str && *str!='\t' ) str++; @@ -918,7 +919,7 @@ static void naive_concat(args_t *args) // Output all non-header data that were read together with the header block if ( fp->block_length - nskip > 0 ) { - if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",bgzf_out->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); @@ -951,7 +952,7 @@ static void naive_concat(args_t *args) } free(buf); free(tmp.s); - if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); + if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",strerror(errno)); } static void usage(args_t *args) @@ -986,7 +987,7 @@ static void usage(args_t *args) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(stderr, " -v, --verbosity INT Set verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); @@ -1008,6 +1009,7 @@ int main_vcfconcat(int argc, char *argv[]) static struct option loptions[] = { 
{"verbose",required_argument,NULL,'v'}, + {"verbosity",required_argument,NULL,'v'}, {"naive",no_argument,NULL,'n'}, {"naive-force",no_argument,NULL,7}, {"compact-PS",no_argument,NULL,'c'}, @@ -1080,6 +1082,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'v': args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); + if ( args->verbose > 3 ) hts_verbose = args->verbose; break; case 'W': if (!(args->write_index = write_index_parse(optarg))) diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index d238dc04..6e4b47f3 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -643,7 +643,7 @@ static void concat(args_t *args) bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); bcf_hdr_destroy(hdr_ori); } - if ( !fp->is_bin && args->output_type&FT_VCF ) + if ( !fp->is_bin && args->output_type&FT_VCF && !args->out_fh->idx) { line->max_unpack = BCF_UN_STR; // if VCF is on both input and output, avoid VCF to BCF conversion @@ -664,6 +664,7 @@ static void concat(args_t *args) } str++; } + fp->line.l = str - fp->line.s; str = fp->line.s; } while ( *str && *str!='\t' ) str++; @@ -920,7 +921,7 @@ static void naive_concat(args_t *args) // Output all non-header data that were read together with the header block if ( fp->block_length - nskip > 0 ) { - if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); + if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",bgzf_out->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); @@ -953,7 +954,7 @@ static void naive_concat(args_t *args) } free(buf); 
free(tmp.s); - if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); + if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",strerror(errno)); } static void usage(args_t *args) @@ -988,7 +989,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Set verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); @@ -1010,6 +1011,7 @@ int main_vcfconcat(int argc, char *argv[]) static struct option loptions[] = { {"verbose",required_argument,NULL,'v'}, + {"verbosity",required_argument,NULL,'v'}, {"naive",no_argument,NULL,'n'}, {"naive-force",no_argument,NULL,7}, {"compact-PS",no_argument,NULL,'c'}, @@ -1082,6 +1084,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'v': args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); + if ( args->verbose > 3 ) hts_verbose = args->verbose; break; case 'W': if (!(args->write_index = write_index_parse(optarg))) diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index f75085aa..b01742ae 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -1583,6 +1583,7 @@ static void gvcf_to_vcf(args_t *args) char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); strncpy(line->d.allele[0],ref,len); + bcf_update_alleles(hdr,line,(const char**)line->d.allele,line->n_allele); if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); free(ref); } @@ -1620,11 +1621,12 @@ static void usage(void) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, "\n"); - fprintf(stderr, "VCF output options:\n"); + fprintf(stderr, "General options:\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); @@ -1719,12 +1721,16 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"verbosity",required_argument,NULL,'v'}, {"write-index",optional_argument,NULL,'W'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, 
"?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index f8921bf6..07f7961c 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -1585,6 +1585,7 @@ static void gvcf_to_vcf(args_t *args) char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); strncpy(line->d.allele[0],ref,len); + bcf_update_alleles(hdr,line,(const char**)line->d.allele,line->n_allele); if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); free(ref); } @@ -1622,11 +1623,12 @@ static void usage(void) fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "VCF output options:\n"); + fprintf(bcftools_stderr, "General options:\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: 
compression level [v]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); @@ -1721,12 +1723,16 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"verbosity",required_argument,NULL,'v'}, {"write-index",optional_argument,NULL,'W'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index 52d4f945..52d85ea7 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -493,6 +493,7 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); @@ -537,11 +538,15 @@ int main_vcffilter(int argc, char *argv[]) {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'g': args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index c240f799..b42a05fb 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -2,7 +2,7 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -495,6 +495,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); @@ -539,11 +540,15 @@ int main_vcffilter(int argc, char *argv[]) {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'g': args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index be886db3..6f7db004 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -1176,6 +1176,7 @@ static void usage(void) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Check discordance of all samples from B against all samples in A\n"); fprintf(stderr, " bcftools gtcheck -g A.bcf B.bcf\n"); @@ -1247,11 +1248,15 @@ int main_vcfgtcheck(int argc, char *argv[]) {"targets-overlap",required_argument,NULL,8}, {"pairs",1,0,'p'}, {"pairs-file",1,0,'P'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index de7c6162..d5c9f2fa 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -2,7 +2,7 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -1178,6 +1178,7 @@ static void usage(void) fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Check discordance of all samples from B against all samples in A\n"); fprintf(bcftools_stderr, " bcftools gtcheck -g A.bcf B.bcf\n"); @@ -1249,11 +1250,15 @@ int main_vcfgtcheck(int argc, char *argv[]) {"targets-overlap",required_argument,NULL,8}, {"pairs",1,0,'p'}, {"pairs-file",1,0,'P'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfhead.c b/bcftools/vcfhead.c index 0b0222b5..a1815744 100644 --- a/bcftools/vcfhead.c +++ b/bcftools/vcfhead.c @@ -1,7 +1,7 @@ /* vcfhead.c -- view VCF/BCF file headers. Copyright (C) 2021 University of Glasgow. - Copyright (C) 2023 Genome Research Ltd. + Copyright (C) 2023-2025 Genome Research Ltd. Author: John Marshall @@ -42,15 +42,17 @@ int main_vcfhead(int argc, char *argv[]) "Usage: bcftools head [OPTION]... 
[FILE]\n" "\n" "Options:\n" -" -h, --headers INT Display INT header lines [all]\n" -" -n, --records INT Display INT variant record lines [none]\n" -" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT variant record lines [none]\n" +" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n" +" -v, --verbosity INT Verbosity level\n" "\n"; static const struct option loptions[] = { { "headers", required_argument, NULL, 'h' }, { "records", required_argument, NULL, 'n' }, { "samples", required_argument, NULL, 's' }, + { "verbosity", required_argument, NULL, 'v' }, { NULL, 0, NULL, 0 } }; @@ -60,8 +62,11 @@ int main_vcfhead(int argc, char *argv[]) uint64_t nrecords = 0; int c, nargs; - while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:n:s:v:", loptions, NULL)) >= 0) switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; case 'n': nrecords = strtoull(optarg, NULL, 0); break; case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break; diff --git a/bcftools/vcfhead.c.pysam.c b/bcftools/vcfhead.c.pysam.c index 832c9bd7..e751a614 100644 --- a/bcftools/vcfhead.c.pysam.c +++ b/bcftools/vcfhead.c.pysam.c @@ -3,7 +3,7 @@ /* vcfhead.c -- view VCF/BCF file headers. Copyright (C) 2021 University of Glasgow. - Copyright (C) 2023 Genome Research Ltd. + Copyright (C) 2023-2025 Genome Research Ltd. Author: John Marshall @@ -44,15 +44,17 @@ int main_vcfhead(int argc, char *argv[]) "Usage: bcftools head [OPTION]... 
[FILE]\n" "\n" "Options:\n" -" -h, --headers INT Display INT header lines [all]\n" -" -n, --records INT Display INT variant record lines [none]\n" -" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT variant record lines [none]\n" +" -s, --samples INT Display INT records starting with the #CHROM header line [none]\n" +" -v, --verbosity INT Verbosity level\n" "\n"; static const struct option loptions[] = { { "headers", required_argument, NULL, 'h' }, { "records", required_argument, NULL, 'n' }, { "samples", required_argument, NULL, 's' }, + { "verbosity", required_argument, NULL, 'v' }, { NULL, 0, NULL, 0 } }; @@ -62,8 +64,11 @@ int main_vcfhead(int argc, char *argv[]) uint64_t nrecords = 0; int c, nargs; - while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:n:s:v:", loptions, NULL)) >= 0) switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; case 'n': nrecords = strtoull(optarg, NULL, 0); break; case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break; diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index 17eac5f3..d6029832 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -1,6 +1,6 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014-2024 Genome Research Ltd. + Copyright (C) 2014-2025 Genome Research Ltd. 
Author: Shane McCarthy @@ -57,6 +57,7 @@ static void usage(void) fprintf(stderr, " -o, --output FILE optional output index file name\n"); fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT verbosity level\n"); fprintf(stderr, "\n"); fprintf(stderr, "Stats options:\n"); fprintf(stderr, " -a, --all with --stats, print stats for all contigs even when zero\n"); @@ -236,16 +237,20 @@ int main_vcfindex(int argc, char *argv[]) {"stats",no_argument,NULL,'s'}, {"nrecords",no_argument,NULL,'n'}, {"threads",required_argument,NULL,9}, + {"verbosity",required_argument,NULL,'v'}, {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:snao:v:", loptions, NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'c': tbi = 0; break; case 't': tbi = 1; min_shift = 0; break; case 'f': force = 1; break; diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 8f6932f1..e103cac0 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -2,7 +2,7 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014-2024 Genome Research Ltd. + Copyright (C) 2014-2025 Genome Research Ltd. 
Author: Shane McCarthy @@ -59,6 +59,7 @@ static void usage(void) fprintf(bcftools_stderr, " -o, --output FILE optional output index file name\n"); fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT verbosity level\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Stats options:\n"); fprintf(bcftools_stderr, " -a, --all with --stats, print stats for all contigs even when zero\n"); @@ -238,16 +239,20 @@ int main_vcfindex(int argc, char *argv[]) {"stats",no_argument,NULL,'s'}, {"nrecords",no_argument,NULL,'n'}, {"threads",required_argument,NULL,9}, + {"verbosity",required_argument,NULL,'v'}, {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:snao:v:", loptions, NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'c': tbi = 0; break; case 't': tbi = 1; min_shift = 0; break; case 'f': force = 1; break; diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 24a45685..51750c17 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Author: Petr Danecek @@ -460,7 +460,7 @@ static void destroy_data(args_t *args) { if ( !args->fnames[i] ) continue; if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); - int is_tbi = !args->write_index + int is_tbi = !args->write_index || (args->write_index&127) == HTS_FMT_TBI; if ( args->output_type==FT_VCF_GZ && is_tbi ) { @@ -476,8 +476,8 @@ static void destroy_data(args_t *args) free(args->fh_out); free(args->fnames); if ( args->fh_sites ) fclose(args->fh_sites); - if ( args->write ) free(args->write); } + free(args->write); } static void usage(void) @@ -487,7 +487,7 @@ static void usage(void) fprintf(stderr, "Usage: bcftools isec [options] [...]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); fprintf(stderr, " -C, --complement Output positions present only in the first file but missing in the others\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); @@ -504,7 +504,8 @@ static void usage(void) fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); - fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); @@ -565,11 +566,15 @@ int main_vcfisec(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -597,6 +602,7 @@ int main_vcfisec(int argc, char *argv[]) else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; + else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'f': args->files->apply_filters = optarg; break; diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index f4727c1d..f3c9e02d 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -2,7 +2,7 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Author: Petr Danecek @@ -462,7 +462,7 @@ static void destroy_data(args_t *args) { if ( !args->fnames[i] ) continue; if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); - int is_tbi = !args->write_index + int is_tbi = !args->write_index || (args->write_index&127) == HTS_FMT_TBI; if ( args->output_type==FT_VCF_GZ && is_tbi ) { @@ -478,8 +478,8 @@ static void destroy_data(args_t *args) free(args->fh_out); free(args->fnames); if ( args->fh_sites ) fclose(args->fh_sites); - if ( args->write ) free(args->write); } + free(args->write); } static void usage(void) @@ -489,7 +489,7 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools isec [options] [...]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); fprintf(bcftools_stderr, " -C, --complement Output positions present only in the first file but missing in the others\n"); fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); @@ -506,7 +506,8 @@ static void usage(void) fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); - fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); @@ -567,11 +568,15 @@ int main_vcfisec(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -599,6 +604,7 @@ int main_vcfisec(int argc, char *argv[]) else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; + else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'f': args->files->apply_filters = optarg; break; diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 3ca5f287..ce6a71c9 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -132,6 +132,7 @@ typedef struct int mrec; // allocated size of buf maux1_t *rec; // buffer to keep reader's lines bcf1_t **lines; // source buffer: either gvcf or readers' buffer + bcf_hdr_t *hdr; // this reader's header int var_types; // reader's variant types in the active [beg,end] window } buffer_t; @@ -871,7 +872,10 @@ maux_t *maux_init(args_t *args) ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int)); ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); for (i=0; in; i++) + { ma->buf[i].rid = -1; + ma->buf[i].hdr = files->readers[i].header; + } ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); if ( args->local_alleles ) { @@ -1759,7 +1763,11 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) type_t val = convert(&p_ori[k * sizeof(type_t)]); \ if ( val==vector_end ) break; /* smaller ploidy */ \ ma->smpl_ploidy[ismpl+j]++; \ - if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \ + if ( bcf_gt_is_missing(val) ) \ + { \ + if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \ + else tmp[k] = 0; /* missing allele, unphased */ \ + } \ else tmp[k] = val; \ } \ for (; ksmpl_ploidy[ismpl+j]++; \ - if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \ + if ( bcf_gt_is_missing(val) ) \ + { \ + if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \ + else tmp[k] = 0; /* missing allele, unphased */ \ + } \ else \ { \ int al = (val>>1) - 1; \ @@ -2711,20 +2723,33 @@ void gvcf_flush(args_t *args, int done) } } -static inline int is_gvcf_block(bcf1_t *line) +static inline int is_gvcf_block(args_t *args, bcf1_t *line) { + maux_t *ma; + if ( line->rlen<=1 ) return 0; if ( strlen(line->d.allele[0])==line->rlen ) return 0; - if ( line->n_allele==1 ) return 1; + if ( line->n_allele==1 ) goto is_gvcf; int i; for (i=1; in_allele; i++) { - if ( !strcmp(line->d.allele[i],"<*>") ) return 1; - if ( !strcmp(line->d.allele[i],"") ) return 1; - if ( !strcmp(line->d.allele[i],"") ) 
return 1; + if ( !strcmp(line->d.allele[i],"<*>") ) goto is_gvcf; + if ( !strcmp(line->d.allele[i],"") ) goto is_gvcf; + if ( !strcmp(line->d.allele[i],"") ) goto is_gvcf; } return 0; + +is_gvcf: + ma = args->maux; + if ( !ma->gvcf ) + { + args->do_gvcf = 1; + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n + for (i=0; in; i++) + ma->gvcf[i].line = bcf_init1(); + } + return 1; } /* @@ -2764,7 +2789,7 @@ void gvcf_stage(args_t *args, int pos) int irec = maux->buf[i].beg; bcf_hdr_t *hdr = bcf_sr_get_header(files, i); bcf1_t *line = args->files->readers[i].buffer[irec]; - int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; + int ret = is_gvcf_block(args,line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; if ( ret==1 ) { if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END @@ -2917,23 +2942,48 @@ static const int indel_mask = (VCF_INDEL<<1), ins_mask = VCF_INS<<1, del_mask = VCF_DEL<<1, - ref_mask = 1; + ref_mask = 1, + other_mask = VCF_OTHER<<1; + +typedef struct +{ + int types, // selected types, see the *_mask(s) above + end; // if symbolic allele is involved, the END coordinate of the first record + bcf1_t *rec; // the first record selected +} +selected_t; // Can these types be merged given the -m settings? 
Despite the function's name, its focus is on // excluding incompatible records, there will be a finer matching later in stage_line() -static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec) +static inline int types_compatible(args_t *args, selected_t *selected, buffer_t *buf, int irec) { int k; maux_t *maux = args->maux; bcf1_t *rec = buf->lines[irec]; int rec_types = buf->rec[irec].var_types; - assert( selected_types ); // this is trivially true, set in can_merge() + int end = -1; + if ( rec_types&other_mask ) + { + int32_t *itmp = NULL, nitmp = 0; + bcf_get_info_int32(buf->hdr,rec,"END",&itmp,&nitmp); + end = nitmp==1 ? itmp[0] : -1; + free(itmp); + } + + // First time here? + if ( !selected->types ) + { + selected->end = end; + selected->rec = rec; + selected->types = rec_types; + return 1; + } if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything // REF and gVCF_REF with no other alleles present can be merged with anything - if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1; + if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1; if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1; if ( args->collapse!=COLLAPSE_NONE ) @@ -2944,26 +2994,26 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) ) { - if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1; + if ( (rec_types&snp_mask) && (selected->types&snp_mask) ) return 1; } if ( args->collapse&COLLAPSE_INDELS ) { - if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1; + if ( (rec_types&indel_mask) && (selected->types&indel_mask) ) return 1; } if ( args->collapse&COLLAPSE_SNP_INS_DEL ) { - if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1; - if ( (rec_types&del_mask) && 
(selected_types&del_mask) ) return 1; + if ( (rec_types&ins_mask) && (selected->types&ins_mask) ) return 1; + if ( (rec_types&del_mask) && (selected->types&del_mask) ) return 1; } // Whatever is left, allow to match if the alleles match exactly } // The -m none mode or exact matching requested // Simple test first: are the variants of the same type? - int x = selected_types >> 1; // remove REF - int y = rec_types >> 1; // remove REF - while ( x && y ) { x>>=1; y>>=1; } - if ( x || y ) return 0; // the types differ + int x = selected->types; + int y = rec_types; + if ( !(x&y) ) return 0; // no matching type + if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible for (k=1; kn_allele; k++) @@ -2972,6 +3022,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break; } if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k] + + if ( selected->types&other_mask && rec_types&other_mask ) + { + // both records have symbolic alleles and the alleles are the same + if ( selected->end!=end ) return 0; + } + return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF } @@ -3089,7 +3146,7 @@ int can_merge(args_t *args) var_type &= ~VCF_INDEL; } var_type = var_type ? var_type<<1 : ref_mask; - if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask; + if ( args->do_gvcf && is_gvcf_block(args,line) ) var_type |= ref_mask; buf->rec[j].var_types = var_type; } maux->var_types |= buf->rec[j].var_types; @@ -3098,7 +3155,7 @@ int can_merge(args_t *args) } if ( !ntodo ) return 0; - int selected_types = 0; + selected_t selected = {0,0,NULL}; // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). 
Go through all files and all lines at this @@ -3113,7 +3170,7 @@ int can_merge(args_t *args) gaux[i].line->d.allele[0][0] = ref; gaux[i].line->pos = maux->pos; maux_update_alleles(args, i, buf->beg); - selected_types |= ref_mask; + selected.types |= ref_mask; continue; } for (j=buf->beg; jend; j++) @@ -3128,16 +3185,25 @@ int can_merge(args_t *args) { if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record } - else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue; - else - { - // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes - if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics - && (maux->var_types&snp_mask) // there are SNVs at the current position - && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref - ) continue; - } - selected_types |= line_types; + else if ( !types_compatible(args,&selected,buf,j) ) continue; + + // This is not a good code. It makes the incorrect assumption of always having a SNP record available. + // However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf). + // We'd need to first check if it is possible to merge with something at all, and only then start excluding. + // Anyway, the can_merge() function should be about a *possibility*, one might argue that the priority should be handled in + // the stage_line() function. + // Commenting this out makes only one difference in our test case: reorders the output lines so that indels can come first. 
+ // + // else + // { + // // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes + // if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics + // && (maux->var_types&snp_mask) // there are SNVs at the current position + // && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref + // ) continue; + // } + + selected.types |= line_types; buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line maux_update_alleles(args, i, j); @@ -3180,7 +3246,8 @@ void stage_line(args_t *args) if ( buf->rec[j].skip ) { int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0; - if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1; + if ( !is_gvcf && is_gvcf_block(args,buf->lines[j]) ) is_gvcf = 1; + if ( is_gvcf && buf->rec[j].skip && !maux->gvcf[i].active ) continue; if ( !is_gvcf ) continue; // done or not compatible } if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line @@ -3371,6 +3438,7 @@ void merge_vcf(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); + bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( args->header_fname ) { @@ -3392,7 +3460,6 @@ void merge_vcf(args_t *args) info_rules_init(args); missing_rules_init(args); - bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( args->header_only ) { @@ -3488,6 +3555,7 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE 
Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); @@ -3534,11 +3602,15 @@ int main_vcfmerge(int argc, char *argv[]) {"force-single",no_argument,NULL,12}, {"filter-logic",required_argument,NULL,'F'}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'L': args->local_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg); diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index d0802d07..eb310b85 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -2,7 +2,7 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -134,6 +134,7 @@ typedef struct int mrec; // allocated size of buf maux1_t *rec; // buffer to keep reader's lines bcf1_t **lines; // source buffer: either gvcf or readers' buffer + bcf_hdr_t *hdr; // this reader's header int var_types; // reader's variant types in the active [beg,end] window } buffer_t; @@ -873,7 +874,10 @@ maux_t *maux_init(args_t *args) ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int)); ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); for (i=0; in; i++) + { ma->buf[i].rid = -1; + ma->buf[i].hdr = files->readers[i].header; + } ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); if ( args->local_alleles ) { @@ -1761,7 +1765,11 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) type_t val = convert(&p_ori[k * sizeof(type_t)]); \ if ( val==vector_end ) break; /* smaller ploidy */ \ ma->smpl_ploidy[ismpl+j]++; \ - if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \ + if ( bcf_gt_is_missing(val) ) \ + { \ + if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \ + else tmp[k] = 0; /* missing allele, unphased */ \ + } \ else tmp[k] = val; \ } \ for (; ksmpl_ploidy[ismpl+j]++; \ - if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \ + if ( bcf_gt_is_missing(val) ) \ + { \ + if ( bcf_gt_is_phased(val) ) tmp[k] = 1; /* missing allele, phased */ \ + else tmp[k] = 0; /* missing allele, unphased */ \ + } \ else \ { \ int al = (val>>1) - 1; \ @@ -2713,20 +2725,33 @@ void gvcf_flush(args_t *args, int done) } } -static inline int is_gvcf_block(bcf1_t *line) +static inline int is_gvcf_block(args_t *args, bcf1_t *line) { + maux_t *ma; + if ( line->rlen<=1 ) return 0; if ( strlen(line->d.allele[0])==line->rlen ) return 0; - if ( line->n_allele==1 ) return 1; + if ( line->n_allele==1 ) goto is_gvcf; int i; for (i=1; in_allele; i++) { - if ( !strcmp(line->d.allele[i],"<*>") ) return 1; - if ( !strcmp(line->d.allele[i],"") ) return 1; - if ( !strcmp(line->d.allele[i],"") ) 
return 1; + if ( !strcmp(line->d.allele[i],"<*>") ) goto is_gvcf; + if ( !strcmp(line->d.allele[i],"") ) goto is_gvcf; + if ( !strcmp(line->d.allele[i],"") ) goto is_gvcf; } return 0; + +is_gvcf: + ma = args->maux; + if ( !ma->gvcf ) + { + args->do_gvcf = 1; + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n + for (i=0; in; i++) + ma->gvcf[i].line = bcf_init1(); + } + return 1; } /* @@ -2766,7 +2791,7 @@ void gvcf_stage(args_t *args, int pos) int irec = maux->buf[i].beg; bcf_hdr_t *hdr = bcf_sr_get_header(files, i); bcf1_t *line = args->files->readers[i].buffer[irec]; - int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; + int ret = is_gvcf_block(args,line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; if ( ret==1 ) { if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END @@ -2919,23 +2944,48 @@ static const int indel_mask = (VCF_INDEL<<1), ins_mask = VCF_INS<<1, del_mask = VCF_DEL<<1, - ref_mask = 1; + ref_mask = 1, + other_mask = VCF_OTHER<<1; + +typedef struct +{ + int types, // selected types, see the *_mask(s) above + end; // if symbolic allele is involved, the END coordinate of the first record + bcf1_t *rec; // the first record selected +} +selected_t; // Can these types be merged given the -m settings? 
Despite the function's name, its focus is on // excluding incompatible records, there will be a finer matching later in stage_line() -static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec) +static inline int types_compatible(args_t *args, selected_t *selected, buffer_t *buf, int irec) { int k; maux_t *maux = args->maux; bcf1_t *rec = buf->lines[irec]; int rec_types = buf->rec[irec].var_types; - assert( selected_types ); // this is trivially true, set in can_merge() + int end = -1; + if ( rec_types&other_mask ) + { + int32_t *itmp = NULL, nitmp = 0; + bcf_get_info_int32(buf->hdr,rec,"END",&itmp,&nitmp); + end = nitmp==1 ? itmp[0] : -1; + free(itmp); + } + + // First time here? + if ( !selected->types ) + { + selected->end = end; + selected->rec = rec; + selected->types = rec_types; + return 1; + } if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything // REF and gVCF_REF with no other alleles present can be merged with anything - if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1; + if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1; if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1; if ( args->collapse!=COLLAPSE_NONE ) @@ -2946,26 +2996,26 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) ) { - if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1; + if ( (rec_types&snp_mask) && (selected->types&snp_mask) ) return 1; } if ( args->collapse&COLLAPSE_INDELS ) { - if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1; + if ( (rec_types&indel_mask) && (selected->types&indel_mask) ) return 1; } if ( args->collapse&COLLAPSE_SNP_INS_DEL ) { - if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1; - if ( (rec_types&del_mask) && 
(selected_types&del_mask) ) return 1; + if ( (rec_types&ins_mask) && (selected->types&ins_mask) ) return 1; + if ( (rec_types&del_mask) && (selected->types&del_mask) ) return 1; } // Whatever is left, allow to match if the alleles match exactly } // The -m none mode or exact matching requested // Simple test first: are the variants of the same type? - int x = selected_types >> 1; // remove REF - int y = rec_types >> 1; // remove REF - while ( x && y ) { x>>=1; y>>=1; } - if ( x || y ) return 0; // the types differ + int x = selected->types; + int y = rec_types; + if ( !(x&y) ) return 0; // no matching type + if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible for (k=1; kn_allele; k++) @@ -2974,6 +3024,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break; } if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k] + + if ( selected->types&other_mask && rec_types&other_mask ) + { + // both records have symbolic alleles and the alleles are the same + if ( selected->end!=end ) return 0; + } + return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF } @@ -3091,7 +3148,7 @@ int can_merge(args_t *args) var_type &= ~VCF_INDEL; } var_type = var_type ? var_type<<1 : ref_mask; - if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask; + if ( args->do_gvcf && is_gvcf_block(args,line) ) var_type |= ref_mask; buf->rec[j].var_types = var_type; } maux->var_types |= buf->rec[j].var_types; @@ -3100,7 +3157,7 @@ int can_merge(args_t *args) } if ( !ntodo ) return 0; - int selected_types = 0; + selected_t selected = {0,0,NULL}; // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). 
Go through all files and all lines at this @@ -3115,7 +3172,7 @@ int can_merge(args_t *args) gaux[i].line->d.allele[0][0] = ref; gaux[i].line->pos = maux->pos; maux_update_alleles(args, i, buf->beg); - selected_types |= ref_mask; + selected.types |= ref_mask; continue; } for (j=buf->beg; jend; j++) @@ -3130,16 +3187,25 @@ int can_merge(args_t *args) { if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record } - else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue; - else - { - // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes - if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics - && (maux->var_types&snp_mask) // there are SNVs at the current position - && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref - ) continue; - } - selected_types |= line_types; + else if ( !types_compatible(args,&selected,buf,j) ) continue; + + // This is not a good code. It makes the incorrect assumption of always having a SNP record available. + // However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf). + // We'd need to first check if it is possible to merge with something at all, and only then start excluding. + // Anyway, the can_merge() function should be about a *possibility*, one might argue that the priority should be handled in + // the stage_line() function. + // Commenting this out makes only one difference in our test case: reorders the output lines so that indels can come first. 
+ // + // else + // { + // // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes + // if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics + // && (maux->var_types&snp_mask) // there are SNVs at the current position + // && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref + // ) continue; + // } + + selected.types |= line_types; buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line maux_update_alleles(args, i, j); @@ -3182,7 +3248,8 @@ void stage_line(args_t *args) if ( buf->rec[j].skip ) { int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0; - if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1; + if ( !is_gvcf && is_gvcf_block(args,buf->lines[j]) ) is_gvcf = 1; + if ( is_gvcf && buf->rec[j].skip && !maux->gvcf[i].active ) continue; if ( !is_gvcf ) continue; // done or not compatible } if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line @@ -3373,6 +3440,7 @@ void merge_vcf(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); + bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( args->header_fname ) { @@ -3394,7 +3462,6 @@ void merge_vcf(args_t *args) info_rules_init(args); missing_rules_init(args); - bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( args->header_only ) { @@ -3490,6 +3557,7 @@ static void usage(void) fprintf(bcftools_stderr, " -R, --regions-file 
FILE Restrict to regions listed in a file\n"); fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); @@ -3536,11 +3604,15 @@ int main_vcfmerge(int argc, char *argv[]) {"force-single",no_argument,NULL,12}, {"filter-logic",required_argument,NULL,'F'}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'L': args->local_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg); diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index f4725338..cabbebfb 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -93,7 +93,7 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, *tmp_sym, tmp_kstr; + kstring_t *tmp_als, *tmp_sym, tmp_kstr, old_rec_tag_kstr; int ntmp_als, ntmp_sym; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider @@ -105,7 +105,7 @@ typedef struct struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; - int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious; + int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, nrmdup, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; @@ -113,7 +113,7 @@ typedef struct char *old_rec_tag; htsFile *out; char *index_fn; - int write_index, gff_verbosity; + int write_index, verbose; int right_align; char *gff_fname; gff_t *gff; @@ -127,6 +127,42 @@ typedef struct } args_t; +static void old_rec_tag_init(args_t *args, bcf1_t *line) +{ + if ( !args->old_rec_tag ) return; + + args->old_rec_tag_kstr.l = 0; + ksprintf(&args->old_rec_tag_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + int i; + for (i=1; in_allele; i++) + { + kputs(line->d.allele[i],&args->old_rec_tag_kstr); + if ( i+1n_allele ) kputc(',',&args->old_rec_tag_kstr); + } +} +static void old_rec_tag_set(args_t *args, bcf1_t *line, int ialt) +{ + if ( !args->old_rec_tag || !args->old_rec_tag_kstr.l ) return; + + // only update if the tag is not present already, there can be multiple normalization steps + int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); + bcf_unpack(line, BCF_UN_INFO); + for (i=0; in_info; i++) + { + bcf_info_t *inf = &line->d.info[i]; + if ( inf && inf->key == id ) return; + } + + if ( ialt>0 ) + { + 
kputc('|',&args->old_rec_tag_kstr); + kputw(ialt,&args->old_rec_tag_kstr); + } + if ( (bcf_update_info_string(args->out_hdr, line, args->old_rec_tag, args->old_rec_tag_kstr.s))!=0 ) + error("An error occurred while updating INFO/%s\n",args->old_rec_tag); + args->old_rec_tag_kstr.l = 0; +} + static inline int replace_iupac_codes(char *seq, int nseq) { // Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end @@ -159,7 +195,8 @@ static void seq_to_upper(char *seq, int len) for (i=0; seq[i]; i++) seq[i] = nt_to_upper(seq[i]); } -static void fix_ref(args_t *args, bcf1_t *line) +// returns 0 when no fix was needed, 1 otherwise +static int fix_ref(args_t *args, bcf1_t *line) { bcf_unpack(line, BCF_UN_STR); int reflen = strlen(line->d.allele[0]); @@ -177,7 +214,7 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.tot++; // is the REF different? If not, we are done - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 0; } // is the REF allele missing? if ( reflen==1 && line->d.allele[0][0]=='.' ) @@ -186,11 +223,11 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.set++; free(ref); bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - return; + return 1; } // does REF or ALT contain non-standard bases? - int has_non_acgtn = 0; + int ret = 0, has_non_acgtn = 0; for (i=0; in_allele; i++) { if ( line->d.allele[i][0]=='<' ) continue; @@ -200,7 +237,8 @@ static void fix_ref(args_t *args, bcf1_t *line) { args->nref.set++; bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 1; } + ret = 1; } // does the REF allele contain N's ? 
@@ -221,12 +259,12 @@ static void fix_ref(args_t *args, bcf1_t *line) } if ( fix ) { + ret = 1; args->nref.set++; bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return ret; } } - // is it swapped? for (i=1; in_allele; i++) { @@ -237,6 +275,7 @@ static void fix_ref(args_t *args, bcf1_t *line) kstring_t str = {0,0,0}; if ( i==line->n_allele ) // none of the alternate alleles matches the reference { + ret = 1; args->nref.set++; kputsn(ref,reflen,&str); for (i=1; in_allele; i++) @@ -247,7 +286,7 @@ static void fix_ref(args_t *args, bcf1_t *line) bcf_update_alleles_str(args->out_hdr,line,str.s); free(ref); free(str.s); - return; + return ret; } // one of the alternate alleles matches the reference, assume it's a simple swap @@ -289,6 +328,7 @@ static void fix_ref(args_t *args, bcf1_t *line) ac[i-1] = ni; bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac); } + return 1; } static void fix_dup_alt(args_t *args, bcf1_t *line) @@ -334,41 +374,41 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts); } -static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) -{ - if ( !args->old_rec_tag ) return; - - // only update if the tag is not present already, there can be multiple normalization steps - int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); - bcf_unpack(dst, BCF_UN_INFO); - for (i=0; in_info; i++) - { - bcf_info_t *inf = &dst->d.info[i]; - if ( inf && inf->key == id ) return; - } - - args->tmp_kstr.l = 0; - ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); - for (i=1; in_allele; i++) - { - kputs(src->d.allele[i],&args->tmp_kstr); - if ( i+1n_allele ) kputc(',',&args->tmp_kstr); - } - if ( ialt>0 ) - { - kputc('|',&args->tmp_kstr); 
- kputw(ialt,&args->tmp_kstr); - } - if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) - error("An error occurred while updating INFO/%s\n",args->old_rec_tag); -} +// static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) +// { +// fprintf(stderr,"remove me\n"); +// if ( !args->old_rec_tag ) return; +// +// // only update if the tag is not present already, there can be multiple normalization steps +// int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); +// bcf_unpack(dst, BCF_UN_INFO); +// for (i=0; in_info; i++) +// { +// bcf_info_t *inf = &dst->d.info[i]; +// if ( inf && inf->key == id ) return; +// } +// +// args->tmp_kstr.l = 0; +// ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); +// for (i=1; in_allele; i++) +// { +// kputs(src->d.allele[i],&args->tmp_kstr); +// if ( i+1n_allele ) kputc(',',&args->tmp_kstr); +// } +// if ( ialt>0 ) +// { +// kputc('|',&args->tmp_kstr); +// kputw(ialt,&args->tmp_kstr); +// } +// if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) +// error("An error occurred while updating INFO/%s\n",args->old_rec_tag); +// } static int is_left_align(args_t *args, bcf1_t *line) { if ( args->right_align ) return 0; if ( !args->gff ) return 1; const char *chr = bcf_seqname(args->hdr,line); - if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1; // if there are two conflicting overlapping transcripts, go with the default left-alignment @@ -523,6 +563,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line) static int realign(args_t *args, bcf1_t *line) { bcf_unpack(line, BCF_UN_STR); + old_rec_tag_init(args,line); // Sanity check REF int i, nref, reflen = strlen(line->d.allele[0]); @@ -655,7 +696,7 
@@ static int realign(args_t *args, bcf1_t *line) } if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; - set_old_rec_tag(args, line, line, 0); + old_rec_tag_set(args, line, 0); // Create new block of alleles and update args->tmp_kstr.l = 0; @@ -1247,6 +1288,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1(); bcf1_t *dst = args->tmp_lines[i]; bcf_clear(dst); + old_rec_tag_init(args,line); dst->rid = line->rid; dst->pos = line->pos; @@ -1271,7 +1313,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst); else split_info_string(args, line, info, i, dst); } - set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes + old_rec_tag_set(args, dst, i + 1); // 1-based indexes dst->n_sample = line->n_sample; for (j=0; jn_fmt; j++) @@ -2138,10 +2180,10 @@ static void flush_buffer(args_t *args, htsFile *file, int n) int line_type = bcf_get_variant_types(args->lines[k]); if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) { - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; - if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) { args->nrmdup++; continue; } // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, 
&args->cmpals_out, args->lines[k]) ) { args->nrmdup++; continue; } } else { @@ -2190,6 +2232,15 @@ static void init_data(args_t *args) args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t)); args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr)); } + if ( args->mrows_op==MROWS_SPLIT ) + { + // check the sanity of splitted fields, specifically of SVLEN (#2371) + int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"SVLEN"); + if ( id>=0 && bcf_hdr_id2length(args->hdr,BCF_HL_INFO,id)!=BCF_VL_A ) + fprintf(stderr, + "Warning: the tag INFO/SVLEN must be defined as Number=A in order for the field to be split\n" + " (the command `bcftools reheader` can be used to fix the header)\n"); + } if ( args->atomize==SPLIT ) { args->abuf = abuf_init(args->hdr, SPLIT); @@ -2204,8 +2255,7 @@ static void init_data(args_t *args) if ( args->gff_fname ) { args->gff = gff_init(args->gff_fname); - gff_set(args->gff,verbosity,args->gff_verbosity); - gff_set(args->gff,strip_chr_names,1); + gff_set(args->gff,verbosity,args->verbose); gff_parse(args->gff); args->idx_tscript = gff_get(args->gff,idx_tscript); args->itr_tscript = regitr_init(NULL); @@ -2246,6 +2296,7 @@ static void destroy_data(args_t *args) free(args->tmp_als); free(args->tmp_sym); free(args->tmp_kstr.s); + free(args->old_rec_tag_kstr.s); if ( args->tmp_str ) { for (i=0; ihdr); i++) free(args->tmp_str[i].s); @@ -2269,7 +2320,11 @@ static void normalize_line(args_t *args, bcf1_t *line) { if ( args->fai ) { - if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line); + if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) + { + old_rec_tag_init(args,line); + if ( fix_ref(args,line) ) old_rec_tag_set(args,line,0); + } if ( args->do_indels ) { int ret = args->filter_pass ? realign(args, line) : ERR_OK; @@ -2425,8 +2480,8 @@ static void normalize_vcf(args_t *args) } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - fprintf(stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n", - args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter); + fprintf(stderr,"Lines total/split/joined/realigned/mismatch_removed/dup_removed/skipped:\t%d/%d/%d/%d/%d/%d/%d\n", + args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nrmdup,args->nfilter); if ( args->check_ref & CHECK_REF_FIX ) fprintf(stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); } @@ -2467,7 +2522,7 @@ static void usage(void) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); - fprintf(stderr, " -v, --verbose INT Verbosity level (0-2) of GFF parsing [1]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); @@ -2500,7 +2555,7 @@ int main_vcfnorm(int argc, char *argv[]) int region_is_file = 0; int targets_is_file = 0; args->use_star_allele = 1; - args->gff_verbosity = 1; + args->verbose = 1; int regions_overlap = 1; int targets_overlap = 0; args->cmp_func = cmp_bcf_pos; @@ -2539,6 +2594,7 @@ int main_vcfnorm(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, {"verbose",required_argument,NULL,'v'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; @@ -2552,8 +2608,9 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'g': args->gff_fname = optarg; break; case 'v': - args->gff_verbosity = atoi(optarg); - 
if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); + args->verbose = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose > 3 ) hts_verbose = args->verbose; break; case 'a': args->atomize = SPLIT; break; case 'e': @@ -2633,7 +2690,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); + fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d exact`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 4fe92ec9..dc58bbbb 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -95,7 +95,7 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, *tmp_sym, tmp_kstr; + kstring_t *tmp_als, *tmp_sym, tmp_kstr, old_rec_tag_kstr; int ntmp_als, ntmp_sym; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider @@ -107,7 +107,7 @@ typedef struct struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; - int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious; + int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, nrmdup, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; @@ -115,7 +115,7 @@ typedef struct char *old_rec_tag; htsFile *out; char *index_fn; - int write_index, gff_verbosity; + int write_index, verbose; int right_align; char *gff_fname; gff_t *gff; @@ -129,6 +129,42 @@ typedef struct } args_t; +static void old_rec_tag_init(args_t *args, bcf1_t *line) +{ + if ( !args->old_rec_tag ) return; + + args->old_rec_tag_kstr.l = 0; + ksprintf(&args->old_rec_tag_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + int i; + for (i=1; in_allele; i++) + { + kputs(line->d.allele[i],&args->old_rec_tag_kstr); + if ( i+1n_allele ) kputc(',',&args->old_rec_tag_kstr); + } +} +static void old_rec_tag_set(args_t *args, bcf1_t *line, int ialt) +{ + if ( !args->old_rec_tag || !args->old_rec_tag_kstr.l ) return; + + // only update if the tag is not present already, there can be multiple normalization steps + int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); + bcf_unpack(line, BCF_UN_INFO); + for (i=0; in_info; i++) + { + bcf_info_t *inf = &line->d.info[i]; + if ( inf && inf->key == id ) return; + } + + if ( ialt>0 ) + { + 
kputc('|',&args->old_rec_tag_kstr); + kputw(ialt,&args->old_rec_tag_kstr); + } + if ( (bcf_update_info_string(args->out_hdr, line, args->old_rec_tag, args->old_rec_tag_kstr.s))!=0 ) + error("An error occurred while updating INFO/%s\n",args->old_rec_tag); + args->old_rec_tag_kstr.l = 0; +} + static inline int replace_iupac_codes(char *seq, int nseq) { // Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end @@ -161,7 +197,8 @@ static void seq_to_upper(char *seq, int len) for (i=0; seq[i]; i++) seq[i] = nt_to_upper(seq[i]); } -static void fix_ref(args_t *args, bcf1_t *line) +// returns 0 when no fix was needed, 1 otherwise +static int fix_ref(args_t *args, bcf1_t *line) { bcf_unpack(line, BCF_UN_STR); int reflen = strlen(line->d.allele[0]); @@ -179,7 +216,7 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.tot++; // is the REF different? If not, we are done - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 0; } // is the REF allele missing? if ( reflen==1 && line->d.allele[0][0]=='.' ) @@ -188,11 +225,11 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.set++; free(ref); bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - return; + return 1; } // does REF or ALT contain non-standard bases? - int has_non_acgtn = 0; + int ret = 0, has_non_acgtn = 0; for (i=0; in_allele; i++) { if ( line->d.allele[i][0]=='<' ) continue; @@ -202,7 +239,8 @@ static void fix_ref(args_t *args, bcf1_t *line) { args->nref.set++; bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 1; } + ret = 1; } // does the REF allele contain N's ? 
@@ -223,12 +261,12 @@ static void fix_ref(args_t *args, bcf1_t *line) } if ( fix ) { + ret = 1; args->nref.set++; bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); - if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return ret; } } - // is it swapped? for (i=1; in_allele; i++) { @@ -239,6 +277,7 @@ static void fix_ref(args_t *args, bcf1_t *line) kstring_t str = {0,0,0}; if ( i==line->n_allele ) // none of the alternate alleles matches the reference { + ret = 1; args->nref.set++; kputsn(ref,reflen,&str); for (i=1; in_allele; i++) @@ -249,7 +288,7 @@ static void fix_ref(args_t *args, bcf1_t *line) bcf_update_alleles_str(args->out_hdr,line,str.s); free(ref); free(str.s); - return; + return ret; } // one of the alternate alleles matches the reference, assume it's a simple swap @@ -291,6 +330,7 @@ static void fix_ref(args_t *args, bcf1_t *line) ac[i-1] = ni; bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac); } + return 1; } static void fix_dup_alt(args_t *args, bcf1_t *line) @@ -336,41 +376,41 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts); } -static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) -{ - if ( !args->old_rec_tag ) return; - - // only update if the tag is not present already, there can be multiple normalization steps - int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); - bcf_unpack(dst, BCF_UN_INFO); - for (i=0; in_info; i++) - { - bcf_info_t *inf = &dst->d.info[i]; - if ( inf && inf->key == id ) return; - } - - args->tmp_kstr.l = 0; - ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); - for (i=1; in_allele; i++) - { - kputs(src->d.allele[i],&args->tmp_kstr); - if ( i+1n_allele ) kputc(',',&args->tmp_kstr); - } - if ( ialt>0 ) - { - kputc('|',&args->tmp_kstr); 
- kputw(ialt,&args->tmp_kstr); - } - if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) - error("An error occurred while updating INFO/%s\n",args->old_rec_tag); -} +// static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) +// { +// fprintf(bcftools_stderr,"remove me\n"); +// if ( !args->old_rec_tag ) return; +// +// // only update if the tag is not present already, there can be multiple normalization steps +// int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); +// bcf_unpack(dst, BCF_UN_INFO); +// for (i=0; in_info; i++) +// { +// bcf_info_t *inf = &dst->d.info[i]; +// if ( inf && inf->key == id ) return; +// } +// +// args->tmp_kstr.l = 0; +// ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); +// for (i=1; in_allele; i++) +// { +// kputs(src->d.allele[i],&args->tmp_kstr); +// if ( i+1n_allele ) kputc(',',&args->tmp_kstr); +// } +// if ( ialt>0 ) +// { +// kputc('|',&args->tmp_kstr); +// kputw(ialt,&args->tmp_kstr); +// } +// if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) +// error("An error occurred while updating INFO/%s\n",args->old_rec_tag); +// } static int is_left_align(args_t *args, bcf1_t *line) { if ( args->right_align ) return 0; if ( !args->gff ) return 1; const char *chr = bcf_seqname(args->hdr,line); - if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1; // if there are two conflicting overlapping transcripts, go with the default left-alignment @@ -525,6 +565,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line) static int realign(args_t *args, bcf1_t *line) { bcf_unpack(line, BCF_UN_STR); + old_rec_tag_init(args,line); // Sanity check REF int i, nref, reflen = strlen(line->d.allele[0]); @@ -657,7 
+698,7 @@ static int realign(args_t *args, bcf1_t *line) } if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; - set_old_rec_tag(args, line, line, 0); + old_rec_tag_set(args, line, 0); // Create new block of alleles and update args->tmp_kstr.l = 0; @@ -1249,6 +1290,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1(); bcf1_t *dst = args->tmp_lines[i]; bcf_clear(dst); + old_rec_tag_init(args,line); dst->rid = line->rid; dst->pos = line->pos; @@ -1273,7 +1315,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst); else split_info_string(args, line, info, i, dst); } - set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes + old_rec_tag_set(args, dst, i + 1); // 1-based indexes dst->n_sample = line->n_sample; for (j=0; jn_fmt; j++) @@ -2140,10 +2182,10 @@ static void flush_buffer(args_t *args, htsFile *file, int n) int line_type = bcf_get_variant_types(args->lines[k]); if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) { - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; - if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) { args->nrmdup++; continue; } // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, 
&args->cmpals_out, args->lines[k]) ) { args->nrmdup++; continue; } } else { @@ -2192,6 +2234,15 @@ static void init_data(args_t *args) args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t)); args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr)); } + if ( args->mrows_op==MROWS_SPLIT ) + { + // check the sanity of splitted fields, specifically of SVLEN (#2371) + int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"SVLEN"); + if ( id>=0 && bcf_hdr_id2length(args->hdr,BCF_HL_INFO,id)!=BCF_VL_A ) + fprintf(bcftools_stderr, + "Warning: the tag INFO/SVLEN must be defined as Number=A in order for the field to be split\n" + " (the command `bcftools reheader` can be used to fix the header)\n"); + } if ( args->atomize==SPLIT ) { args->abuf = abuf_init(args->hdr, SPLIT); @@ -2206,8 +2257,7 @@ static void init_data(args_t *args) if ( args->gff_fname ) { args->gff = gff_init(args->gff_fname); - gff_set(args->gff,verbosity,args->gff_verbosity); - gff_set(args->gff,strip_chr_names,1); + gff_set(args->gff,verbosity,args->verbose); gff_parse(args->gff); args->idx_tscript = gff_get(args->gff,idx_tscript); args->itr_tscript = regitr_init(NULL); @@ -2248,6 +2298,7 @@ static void destroy_data(args_t *args) free(args->tmp_als); free(args->tmp_sym); free(args->tmp_kstr.s); + free(args->old_rec_tag_kstr.s); if ( args->tmp_str ) { for (i=0; ihdr); i++) free(args->tmp_str[i].s); @@ -2271,7 +2322,11 @@ static void normalize_line(args_t *args, bcf1_t *line) { if ( args->fai ) { - if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line); + if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) + { + old_rec_tag_init(args,line); + if ( fix_ref(args,line) ) old_rec_tag_set(args,line,0); + } if ( args->do_indels ) { int ret = args->filter_pass ? realign(args, line) : ERR_OK; @@ -2427,8 +2482,8 @@ static void normalize_vcf(args_t *args) } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - fprintf(bcftools_stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n", - args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter); + fprintf(bcftools_stderr,"Lines total/split/joined/realigned/mismatch_removed/dup_removed/skipped:\t%d/%d/%d/%d/%d/%d/%d\n", + args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nrmdup,args->nfilter); if ( args->check_ref & CHECK_REF_FIX ) fprintf(bcftools_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); } @@ -2469,7 +2524,7 @@ static void usage(void) fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose INT Verbosity level (0-2) of GFF parsing [1]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); @@ -2502,7 +2557,7 @@ int main_vcfnorm(int argc, char *argv[]) int region_is_file = 0; int targets_is_file = 0; args->use_star_allele = 1; - args->gff_verbosity = 1; + args->verbose = 1; int regions_overlap = 1; int targets_overlap = 0; args->cmp_func = cmp_bcf_pos; @@ -2541,6 +2596,7 @@ int main_vcfnorm(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, {"verbose",required_argument,NULL,'v'}, + {"verbosity",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; @@ -2554,8 +2610,9 @@ int main_vcfnorm(int argc, char *argv[]) 
break; case 'g': args->gff_fname = optarg; break; case 'v': - args->gff_verbosity = atoi(optarg); - if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); + args->verbose = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose > 3 ) hts_verbose = args->verbose; break; case 'a': args->atomize = SPLIT; break; case 'e': @@ -2635,7 +2692,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); + fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d exact`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index 4ee99ee1..b3cbcbbf 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -613,7 +613,7 @@ static void usage(args_t *args) fprintf(stderr, "Plugin options:\n"); fprintf(stderr, " -h, --help List plugin's options\n"); fprintf(stderr, " -l, --list-plugins List available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -V, --version Print version string and exit\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); @@ -678,7 +678,8 @@ int main_plugin(int argc, char *argv[]) static struct option loptions[] = { {"version",no_argument,NULL,'V'}, - {"verbose",no_argument,NULL,'v'}, + {"verbose",optional_argument,NULL,'v'}, + {"verbosity",optional_argument,NULL,'v'}, {"help",no_argument,NULL,'h'}, {"list-plugins",no_argument,NULL,'l'}, {"output",required_argument,NULL,'o'}, @@ -697,11 +698,18 @@ int main_plugin(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:v::VW::",loptions,NULL)) >= 0) { switch (c) { case 'V': version_only = 1; break; - case 'v': args->verbose++; break; + case 'v': + if ( !optarg ) args->verbose++; + else + { + args->verbose = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose > 3 ) hts_verbose = args->verbose; + } case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index f19bc963..024438a9 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -2,7 +2,7 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -615,7 +615,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Plugin options:\n"); fprintf(bcftools_stderr, " -h, --help List plugin's options\n"); fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -V, --version Print version string and exit\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); @@ -680,7 +680,8 @@ int main_plugin(int argc, char *argv[]) static struct option loptions[] = { {"version",no_argument,NULL,'V'}, - {"verbose",no_argument,NULL,'v'}, + {"verbose",optional_argument,NULL,'v'}, + {"verbosity",optional_argument,NULL,'v'}, {"help",no_argument,NULL,'h'}, {"list-plugins",no_argument,NULL,'l'}, {"output",required_argument,NULL,'o'}, @@ -699,11 +700,18 @@ int main_plugin(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:v::VW::",loptions,NULL)) >= 0) { switch (c) { case 'V': version_only = 1; break; - case 'v': args->verbose++; break; + case 'v': + if ( !optarg ) args->verbose++; + else + { + args->verbose = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose > 3 ) hts_verbose = args->verbose; + } case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 7b1dd439..eefb2a2d 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. 
- Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -254,6 +254,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n"); fprintf(stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n"); + fprintf(stderr, " --verbosity INT Verbosity level\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); @@ -297,6 +298,7 @@ int main_vcfquery(int argc, char *argv[]) {"collapse",1,0,'c'}, {"vcf-list",1,0,'v'}, {"allow-undef-tags",0,0,'u'}, + {"verbosity",required_argument,NULL,4}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) { @@ -350,6 +352,9 @@ int main_vcfquery(int argc, char *argv[]) if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : args->force_samples = 1; break; + case 4 : + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 407d2562..818e8d42 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -2,7 +2,7 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -256,6 +256,7 @@ static void usage(void) fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n"); fprintf(bcftools_stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n"); + fprintf(bcftools_stderr, " --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); @@ -299,6 +300,7 @@ int main_vcfquery(int argc, char *argv[]) {"collapse",1,0,'c'}, {"vcf-list",1,0,'v'}, {"allow-undef-tags",0,0,'u'}, + {"verbosity",required_argument,NULL,4}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) { @@ -352,6 +354,9 @@ int main_vcfquery(int argc, char *argv[]) if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : args->force_samples = 1; break; + case 4 : + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index f1d1c86e..1b3eff91 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -1,6 +1,6 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -1076,40 +1076,48 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools roh [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "General Options:\n"); - fprintf(stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); - fprintf(stderr, " --AF-tag use TAG for allele frequency\n"); - fprintf(stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); - fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); - fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); - fprintf(stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); - fprintf(stderr, " in . If TAG is not given, the frequency is estimated from GT by default\n"); - fprintf(stderr, " --exclude exclude sites for which the expression is true\n"); - fprintf(stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); - fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n"); - fprintf(stderr, " --include select sites for which the expression is true\n"); - fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); - fprintf(stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n"); - fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); - fprintf(stderr, " is replaced with chromosome name\n"); - fprintf(stderr, " -M, --rec-rate constant recombination rate per bp\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type [srz] output s:per-site, 
r:regions, z:compressed [sr]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); - fprintf(stderr, " -s, --samples list of samples to analyze [all samples]\n"); - fprintf(stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --AF-dflt FLOAT If AF is not known, use this allele frequency [skip]\n"); + fprintf(stderr, " --AF-tag TAG Use TAG for allele frequency\n"); + fprintf(stderr, " --AF-file FILE read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(stderr, " -b --buffer-size INT[,INT] Buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); + fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); + fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); + fprintf(stderr, " -e, --estimate-AF [TAG],FILE Estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); + fprintf(stderr, " in FILE. 
If TAG is not given, the frequency is estimated from GT by default\n"); + fprintf(stderr, " --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(stderr, " -G, --GTs-only FLOAT Use GTs and ignore PLs, instead using FLOAT for PL of the two least likely genotypes.\n"); + fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n"); + fprintf(stderr, " --include EXPR Select sites for which the expression is true\n"); + fprintf(stderr, " -i, --ignore-homref Skip hom-ref genotypes (0/0)\n"); + fprintf(stderr, " --include-noalt Include sites with no ALT allele (ignored by default)\n"); + fprintf(stderr, " -I, --skip-indels Skip indels as their genotypes are enriched for errors\n"); + fprintf(stderr, " -m, --genetic-map FILE Genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); + fprintf(stderr, " is replaced with chromosome name\n"); + fprintf(stderr, " -M, --rec-rate FLOAT Constant recombination rate per bp\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type [srz] Output s:per-site, r:regions, z:compressed [sr]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --samples LIST List of samples to analyze [all samples]\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to analyze [all samples]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " --threads 
INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "\n"); fprintf(stderr, "HMM Options:\n"); - fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); - fprintf(stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 1e-10 (experimental)\n"); + fprintf(stderr, " -a, --hw-to-az FLOAT P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); + fprintf(stderr, " -H, --az-to-hw FLOAT P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); + fprintf(stderr, " -V, --viterbi-training FLOAT Estimate HMM parameters, FLOAT is the convergence threshold, e.g. 1e-10 (experimental)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Example:\n"); + fprintf(stderr, " # Find RoH regions assuming default allele frequency 0.4\n"); + fprintf(stderr, " bcftools roh -G30 --AF-dflt 0.4 test.vcf -o out.txt\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " # Create HTML/JavaScript visualization with the accompanied roh-viz script\n"); + fprintf(stderr, " misc/roh-viz -i out.txt -v test.vcf -o out.html\n"); fprintf(stderr, "\n"); exit(1); } @@ -1156,13 +1164,17 @@ int main_vcfroh(int argc, char *argv[]) {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, {"threads",1,0,9}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; int naf_opts = 0; char *tmp; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:iv:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = 
optarg; naf_opts++; break; case 2: diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 7519c6e1..a303d2e2 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -2,7 +2,7 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -1078,40 +1078,48 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools roh [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "General Options:\n"); - fprintf(bcftools_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); - fprintf(bcftools_stderr, " --AF-tag use TAG for allele frequency\n"); - fprintf(bcftools_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(bcftools_stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); - fprintf(bcftools_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); - fprintf(bcftools_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); - fprintf(bcftools_stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); - fprintf(bcftools_stderr, " in . 
If TAG is not given, the frequency is estimated from GT by default\n"); - fprintf(bcftools_stderr, " --exclude exclude sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); - fprintf(bcftools_stderr, " Safe value to use is 30 to account for GT errors.\n"); - fprintf(bcftools_stderr, " --include select sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); - fprintf(bcftools_stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n"); - fprintf(bcftools_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(bcftools_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); - fprintf(bcftools_stderr, " is replaced with chromosome name\n"); - fprintf(bcftools_stderr, " -M, --rec-rate constant recombination rate per bp\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); - fprintf(bcftools_stderr, " -s, --samples list of samples to analyze [all samples]\n"); - fprintf(bcftools_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include 
if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --AF-dflt FLOAT If AF is not known, use this allele frequency [skip]\n"); + fprintf(bcftools_stderr, " --AF-tag TAG Use TAG for allele frequency\n"); + fprintf(bcftools_stderr, " --AF-file FILE read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(bcftools_stderr, " -b --buffer-size INT[,INT] Buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); + fprintf(bcftools_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); + fprintf(bcftools_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); + fprintf(bcftools_stderr, " -e, --estimate-AF [TAG],FILE Estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); + fprintf(bcftools_stderr, " in FILE. If TAG is not given, the frequency is estimated from GT by default\n"); + fprintf(bcftools_stderr, " --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -G, --GTs-only FLOAT Use GTs and ignore PLs, instead using FLOAT for PL of the two least likely genotypes.\n"); + fprintf(bcftools_stderr, " Safe value to use is 30 to account for GT errors.\n"); + fprintf(bcftools_stderr, " --include EXPR Select sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -i, --ignore-homref Skip hom-ref genotypes (0/0)\n"); + fprintf(bcftools_stderr, " --include-noalt Include sites with no ALT allele (ignored by default)\n"); + fprintf(bcftools_stderr, " -I, --skip-indels Skip indels as their genotypes are enriched for errors\n"); + fprintf(bcftools_stderr, " -m, --genetic-map FILE Genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); + fprintf(bcftools_stderr, " is replaced with chromosome name\n"); + fprintf(bcftools_stderr, " -M, 
--rec-rate FLOAT Constant recombination rate per bp\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type [srz] Output s:per-site, r:regions, z:compressed [sr]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples to analyze [all samples]\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to analyze [all samples]\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "HMM Options:\n"); - fprintf(bcftools_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); - fprintf(bcftools_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(bcftools_stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 
1e-10 (experimental)\n"); + fprintf(bcftools_stderr, " -a, --hw-to-az FLOAT P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); + fprintf(bcftools_stderr, " -H, --az-to-hw FLOAT P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); + fprintf(bcftools_stderr, " -V, --viterbi-training FLOAT Estimate HMM parameters, FLOAT is the convergence threshold, e.g. 1e-10 (experimental)\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Example:\n"); + fprintf(bcftools_stderr, " # Find RoH regions assuming default allele frequency 0.4\n"); + fprintf(bcftools_stderr, " bcftools roh -G30 --AF-dflt 0.4 test.vcf -o out.txt\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # Create HTML/JavaScript visualization with the accompanied roh-viz script\n"); + fprintf(bcftools_stderr, " misc/roh-viz -i out.txt -v test.vcf -o out.html\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -1158,13 +1166,17 @@ int main_vcfroh(int argc, char *argv[]) {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, {"threads",1,0,9}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; int naf_opts = 0; char *tmp; - while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:iv:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; case 2: diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c index 6e21f85b..c333b761 100644 --- a/bcftools/vcfsort.c +++ b/bcftools/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2024 Genome Research Ltd. + Copyright (C) 2017-2025 Genome Research Ltd. 
Author: Petr Danecek @@ -158,7 +158,7 @@ static int cmp_packed_bcf_pos_ref_alt(const void *aptr, const void *bptr) if ( a->rid > b->rid ) return 1; if ( a->pos < b->pos ) return -1; if ( a->pos > b->pos ) return 1; - + // Sort lexicographically by ref,alt. These are stored tab-separated // as the first item in packed_bcf_t::data return strcmp((char *) a->data, (char *) b->data); @@ -193,7 +193,7 @@ static uint8_t *pack_unsigned(uint8_t *data, uint64_t val) *data++ = (val & 0x7f) | ((val > 0x7f) ? 0x80 : 0); val >>= 7; } while (val > 0); - return data; + return data; } static uint8_t *pack_hts_pos(uint8_t *data, hts_pos_t val) @@ -258,7 +258,7 @@ static int write_packed_bcf(BGZF *fp, packed_bcf_t *src) // Skip the copy of the alleles size_t skip = strlen((char *) src->data) + 1; - + // Write everything else if (src->len < SIZE_MAX) { @@ -337,7 +337,7 @@ static int read_packed_bcf(BGZF *fp, bcf1_t *dest) int err = 0; packed_bcf_t tmp; size_t len = tmp.data - (uint8_t *) &tmp.pos; - + bcf_clear(dest); ssize_t got = bgzf_read_small(fp, &tmp.pos, len); if (got == 0) @@ -698,15 +698,16 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(stderr, " -o, --output FILE output file name [stdout]\n"); + fprintf(stderr, " -m, --max-mem FLOAT[kMG] Maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); #ifdef _WIN32 - fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); + fprintf(stderr, " -T, --temp-dir DIR Temporary files [/bcftools.XXXXXX]\n"); #else - fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); + 
fprintf(stderr, " -T, --temp-dir DIR Temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); @@ -780,13 +781,17 @@ int main_sort(int argc, char *argv[]) {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'm': args->max_mem = parse_mem_string(optarg); break; case 'T': args->tmp_dir = optarg; break; case 'o': args->output_fname = optarg; break; diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index da899ae7..d6b86d07 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -2,7 +2,7 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2024 Genome Research Ltd. + Copyright (C) 2017-2025 Genome Research Ltd. Author: Petr Danecek @@ -160,7 +160,7 @@ static int cmp_packed_bcf_pos_ref_alt(const void *aptr, const void *bptr) if ( a->rid > b->rid ) return 1; if ( a->pos < b->pos ) return -1; if ( a->pos > b->pos ) return 1; - + // Sort lexicographically by ref,alt. These are stored tab-separated // as the first item in packed_bcf_t::data return strcmp((char *) a->data, (char *) b->data); @@ -195,7 +195,7 @@ static uint8_t *pack_unsigned(uint8_t *data, uint64_t val) *data++ = (val & 0x7f) | ((val > 0x7f) ? 
0x80 : 0); val >>= 7; } while (val > 0); - return data; + return data; } static uint8_t *pack_hts_pos(uint8_t *data, hts_pos_t val) @@ -260,7 +260,7 @@ static int write_packed_bcf(BGZF *fp, packed_bcf_t *src) // Skip the copy of the alleles size_t skip = strlen((char *) src->data) + 1; - + // Write everything else if (src->len < SIZE_MAX) { @@ -339,7 +339,7 @@ static int read_packed_bcf(BGZF *fp, bcf1_t *dest) int err = 0; packed_bcf_t tmp; size_t len = tmp.data - (uint8_t *) &tmp.pos; - + bcf_clear(dest); ssize_t got = bgzf_read_small(fp, &tmp.pos, len); if (got == 0) @@ -700,15 +700,16 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] Maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); #ifdef _WIN32 - fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); + fprintf(bcftools_stderr, " -T, --temp-dir DIR Temporary files [/bcftools.XXXXXX]\n"); #else - fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); + fprintf(bcftools_stderr, " -T, --temp-dir DIR Temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); @@ -782,13 +783,17 @@ int main_sort(int argc, char *argv[]) 
{"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,'v'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?v:",loptions,NULL)) >= 0) { switch (c) { + case 'v': + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; case 'm': args->max_mem = parse_mem_string(optarg); break; case 'T': args->tmp_dir = optarg; break; case 'o': args->output_fname = optarg; break; diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 38b4caf5..b0487126 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Author: Petr Danecek @@ -1903,7 +1903,7 @@ static void usage(void) fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); fprintf(stderr, " A subfield can be selected as e.g. 
'PV4[0]', here the first value of the PV4 tag\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); + fprintf(stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(stderr, "\n"); exit(1); } @@ -1931,7 +1931,8 @@ int main_vcfstats(int argc, char *argv[]) {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"regions-overlap",required_argument,NULL,3}, - {"verbose",0,0,'v'}, + {"verbose",optional_argument,0,'v'}, + {"verbosity",optional_argument,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, {"exons",1,0,'E'}, @@ -1946,7 +1947,7 @@ int main_vcfstats(int argc, char *argv[]) {"threads",1,0,9}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:v::IE:",loptions,NULL)) >= 0) { switch (c) { case 1 : args->af_bins_list = optarg; break; case 2 : args->af_tag = optarg; break; @@ -1965,7 +1966,16 @@ int main_vcfstats(int argc, char *argv[]) else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; - case 'v': args->verbose_sites = 1; break; + case 'v': + if ( !optarg ) args->verbose_sites = 1; + else + { + char *tmp; + args->verbose_sites = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose_sites<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose_sites > 3 ) hts_verbose = args->verbose_sites; + } + break; case 'd': if ( sscanf(optarg,"%d,%d,%d",&args->dp_min,&args->dp_max,&args->dp_step)!=3 ) error("Could not parse --depth %s\n", optarg); diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 946032ed..8670b5af 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. 
- Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Author: Petr Danecek @@ -1905,7 +1905,7 @@ static void usage(void) fprintf(bcftools_stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); fprintf(bcftools_stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); + fprintf(bcftools_stderr, " -v, --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -1933,7 +1933,8 @@ int main_vcfstats(int argc, char *argv[]) {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"regions-overlap",required_argument,NULL,3}, - {"verbose",0,0,'v'}, + {"verbose",optional_argument,0,'v'}, + {"verbosity",optional_argument,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, {"exons",1,0,'E'}, @@ -1948,7 +1949,7 @@ int main_vcfstats(int argc, char *argv[]) {"threads",1,0,9}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:v::IE:",loptions,NULL)) >= 0) { switch (c) { case 1 : args->af_bins_list = optarg; break; case 2 : args->af_tag = optarg; break; @@ -1967,7 +1968,16 @@ int main_vcfstats(int argc, char *argv[]) else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; - case 'v': args->verbose_sites = 1; break; + case 'v': + if ( !optarg ) args->verbose_sites = 1; + else + { + char *tmp; + args->verbose_sites = strtol(optarg,&tmp,10); + if ( *tmp || args->verbose_sites<0 ) error("Could not parse argument: --verbosity %s\n", optarg); + if ( args->verbose_sites > 3 ) hts_verbose = args->verbose_sites; + } + break; case 
'd': if ( sscanf(optarg,"%d,%d,%d",&args->dp_min,&args->dp_max,&args->dp_step)!=3 ) error("Could not parse --depth %s\n", optarg); diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index 58063ebb..17b9b70c 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Shane McCarthy @@ -525,6 +525,7 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " --verbosity INT Verbosity level\n"); fprintf(stderr, "\n"); fprintf(stderr, "Subset options:\n"); fprintf(stderr, " -A, --trim-unseen-allele Remove '<*>' or '' at variant (-A) or at all (-AA) sites\n"); @@ -618,6 +619,7 @@ int main_vcfview(int argc, char *argv[]) {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -750,6 +752,10 @@ int main_vcfview(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; + case 'W': if (!(args->write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index d430bca0..9e0a183e 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -2,7 +2,7 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter 
VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Shane McCarthy @@ -527,6 +527,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " --verbosity INT Verbosity level\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Subset options:\n"); fprintf(bcftools_stderr, " -A, --trim-unseen-allele Remove '<*>' or '' at variant (-A) or at all (-AA) sites\n"); @@ -620,6 +621,7 @@ int main_vcfview(int argc, char *argv[]) {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbosity",required_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -752,6 +754,10 @@ int main_vcfview(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : + if ( apply_verbosity(optarg) < 0 ) error("Could not parse argument: --verbosity %s\n", optarg); + break; + case 'W': if (!(args->write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); diff --git a/bcftools/version.c b/bcftools/version.c index 2defb4fb..bbfd1c4e 100644 --- a/bcftools/version.c +++ b/bcftools/version.c @@ -61,6 +61,15 @@ void error_errno(const char *format, ...) 
exit(-1); } +int apply_verbosity(const char *str) +{ + char *tmp; + int verbose = strtol(str,&tmp,10); + if ( *tmp || verbose<0 ) return -1; + if ( verbose > 3 ) hts_verbose = verbose; + return 0; +} + const char *hts_bcf_wmode(int file_type) { if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index 4944b57e..5262e667 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -63,6 +63,15 @@ void error_errno(const char *format, ...) bcftools_exit(-1); } +int apply_verbosity(const char *str) +{ + char *tmp; + int verbose = strtol(str,&tmp,10); + if ( *tmp || verbose<0 ) return -1; + if ( verbose > 3 ) hts_verbose = verbose; + return 0; +} + const char *hts_bcf_wmode(int file_type) { if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF diff --git a/bcftools/version.sh b/bcftools/version.sh index 007c916a..6355d1ea 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.21 +VERSION=1.22 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/devtools/import.py b/devtools/import.py index 2d955dea..626f3018 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -32,13 +32,14 @@ "vcf-miniview.c", ), "bcftools": ( + "mpileup2", "test", "plugins", "peakfit.c", "peakfit.h", "polysomy.c"), "htslib": ( 'annot-tsv.c', 'bgzip.c', 'htsfile.c', 'tabix.c', 'hts_probe_cc.sh', - "samples", "test", "tests"), + "ref_cache", "samples", "test", "tests"), } diff --git a/doc/conf.py b/doc/conf.py index 27c389ce..f6ccaed0 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -56,8 +56,8 @@ # Included at the end of each rst file rst_epilog = ''' .. _pysam: https://github.com/pysam-developers/pysam -.. _samtools: https://www.htslib.org/doc/1.21/samtools.html -.. _bcftools: https://www.htslib.org/doc/1.21/bcftools.html +.. 
_samtools: https://www.htslib.org/doc/1.22/samtools.html +.. _bcftools: https://www.htslib.org/doc/1.22/bcftools.html .. _htslib: https://www.htslib.org/ .. _tabix: https://www.htslib.org/doc/tabix.html .. _Galaxy: https://usegalaxy.org/ diff --git a/doc/index.rst b/doc/index.rst index 17d3d308..ec17ceb1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.21*, *samtools-1.21*, and *bcftools-1.21*. +The current version wraps *htslib-1.22*, *samtools-1.22*, and *bcftools-1.22*. To install the latest release, type:: diff --git a/htslib/LICENSE b/htslib/LICENSE index 87931fae..5d5e5f46 100644 --- a/htslib/LICENSE +++ b/htslib/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2024 Genome Research Ltd. +Copyright (C) 2012-2025 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2024 Genome Research Ltd. +Copyright (C) 2012-2025 Genome Research Ltd. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/htslib/Makefile b/htslib/Makefile index 630720b3..d7ef1309 100644 --- a/htslib/Makefile +++ b/htslib/Makefile @@ -151,8 +151,8 @@ LIBHTS_SOVERSION = 3 # is not strictly necessary and should be removed the next time # LIBHTS_SOVERSION is bumped (see #1144 and # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) -MACH_O_COMPATIBILITY_VERSION = 3.1.21 -MACH_O_CURRENT_VERSION = 3.1.21 +MACH_O_COMPATIBILITY_VERSION = 3.1.22 +MACH_O_CURRENT_VERSION = 3.1.22 # Force version.h to be remade if $(PACKAGE_VERSION) has changed. version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) @@ -189,6 +189,8 @@ config_vars.h: .c.pico: $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< +ref_cache/%.o: ref_cache/%.c + $(CC) $(CFLAGS) $(REF_CACHE_EXTRA_C_FLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< LIBHTS_OBJS = \ kfunc.o \ @@ -232,6 +234,24 @@ LIBHTS_OBJS = \ $(HTSCODECS_OBJS) \ $(NONCONFIGURE_OBJS) +REF_CACHE_OBJS = ref_cache/cmsg_wrap.o \ + ref_cache/http_parser.o \ + ref_cache/listener.o \ + ref_cache/log_files.o \ + ref_cache/main.o \ + ref_cache/misc.o \ + ref_cache/ping.o \ + ref_cache/poll_wrap_epoll.o \ + ref_cache/poll_wrap_poll.o \ + ref_cache/ref_files.o \ + ref_cache/request_handler.o \ + ref_cache/sendfile_wrap.o \ + ref_cache/server.o \ + ref_cache/transaction.o \ + ref_cache/upstream.o \ + cram/pooled_alloc.o \ + md5.o + # Without configure we wish to have a rich set of default figures, # but we still need conditional inclusion as we wish to still # support ./configure --disable-blah. 
@@ -266,6 +286,9 @@ thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) # using the configure script or just comment the line out if you are not. -include config.mk +# Add optional built programs here +all: $(REF_CACHE_PROGRAMS) + # Usually config.h is generated by running configure or config.status, # but if those aren't used create a default config.h here. config.h: @@ -305,6 +328,14 @@ config.h: echo '#define HAVE_ATTRIBUTE_TARGET_SSSE3 1' >> $@ echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@ echo '#endif' >> $@ + echo '#if defined __linux__' >> $@ + echo '#define HAVE_GETAUXVAL' >> $@ + echo '#elif defined __FreeBSD__' >> $@ + echo '#define HAVE_ELF_AUX_INFO' >> $@ + echo '#elif defined __OpenBSD__' >> $@ + echo '// Enable extra OpenBSD checks (see simd.c)' >> $@ + echo '#define HAVE_OPENBSD' >> $@ + echo '#endif' >> $@ # And similarly for htslib.pc.tmp ("pkg-config template"). No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate @@ -486,13 +517,13 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(fuzz_settings_h) $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: 
cram/cram_io.c config.h os/lzma_stub.h $(fuzz_settings_h) $(cram_h) $(cram_os_h) $(htslib_hts_h) $(hts_internal_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h -thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) $(hts_internal_h) htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) @@ -526,11 +557,46 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread +ref_cache/ref-cache: $(REF_CACHE_OBJS) + $(CC) $(LDFLAGS) $(REF_CACHE_EXTRA_LD_FLAGS) -o $@ $(REF_CACHE_OBJS) -lcurl + annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) 
$(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(textutils_internal_h) bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_thread_pool_h) +# ref_cache dependencies +ref_cache_cmsg_wrap_h = ref_cache/cmsg_wrap.h +ref_cache_http_parser_h = ref_cache/http_parser.h $(htslib_hts_defs_h) $(ref_cache_types_h) +ref_cache_listener_h = ref_cache/listener.h $(ref_cache_types_h) +ref_cache_log_files_h = ref_cache/log_files.h $(htslib_hts_defs_h) $(ref_types_h) +ref_cache_misc_h = ref_cache/misc.h $(htslib_hts_defs_h) +ref_cache_options_h = ref_cache/options.h $(ref_cache_types_h) +ref_cache_ping_h = ref_cache/ping.h +ref_cache_poll_wrap_h = ref_cache/poll_wrap.h $(ref_cache_types_h) +ref_cache_ref_files_h = ref_cache/ref_files.h $(htslib_hts_defs_h) $(ref_cache_types_h) +ref_cache_request_handler_h = ref_cache/request_handler.h $(htslib_hts_defs_h) $(ref_cache_types_h) +ref_cache_sendfile_wrap_h = ref_cache/sendfile_wrap.h +ref_cache_server_h = ref_cache/server.h $(htslib_hts_defs_h) $(ref_cache_types_h) +ref_cache_transaction_h = ref_cache/transaction.h $(htslib_hts_defs_h) $(ref_cache_types_h) +ref_cache_types_h = ref_cache/types.h +ref_cache_upstream_h = ref_cache/upstream.h $(ref_cache_types_h) + +ref_cache/cmsg_wrap.o: ref_cache/cmsg_wrap.c config.h $(ref_cache_cmsg_wrap_h) +ref_cache/http_parser.o: ref_cache/http_parser.c config.h $(ref_cache_http_parser_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_request_handler_h) $(ref_cache_server_h) cram/pooled_alloc.h +ref_cache/listener.o: ref_cache/listener.c config.h $(ref_cache_listener_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_poll_wrap_h) +ref_cache/log_files.o: 
ref_cache/log_files.c config.h $(ref_cache_log_files_h) $(ref_cache_options_h) +ref_cache/main.o: ref_cache/main.c config.h $(ref_cache_listener_h) $(ref_cache_log_files_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_ping_h) $(ref_cache_poll_wrap_h) $(ref_cache_server_h) $(ref_cache_upstream_h) +ref_cache/ping.o: ref_cache/ping.c config.h $(ref_cache_ping_h) $(ref_cache_misc_h) $(ref_cache_options_h) +ref_cache/poll_wrap_epoll.o: ref_cache/poll_wrap_epoll.c config.h $(ref_cache_poll_wrap_h) cram/pooled_alloc.h +ref_cache/poll_wrap_poll.o: ref_cache/poll_wrap_poll.c config.h $(ref_cache_poll_wrap_h) cram/pooled_alloc.h +ref_cache/ref_files.o: ref_cache/ref_files.c config.h $(ref_cache_ref_files_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_upstream_h) +ref_cache/request_handler.o: ref_cache/request_handler.c config.h $(ref_cache_request_handler_h) $(ref_cache_http_parser_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_ref_files_h) $(ref_cache_transaction_h) $(ref_cache_upstream_h) +ref_cache/sendfile_wrap.o: ref_cache/sendfile_wrap.c config.h $(ref_cache_sendfile_wrap_h) +ref_cache/server.o: ref_cache/server.c config.h $(ref_cache_server_h) $(ref_cache_http_parser_h) $(ref_cache_listener_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_poll_wrap_h) $(ref_cache_ref_files_h) $(ref_cache_request_handler_h) $(ref_cache_transaction_h) $(ref_cache_upstream_h) cram/pooled_alloc.h +ref_cache/transaction.o: ref_cache/transaction.c config.h $(ref_cache_transaction_h) $(ref_cache_http_parser_h) $(ref_cache_options_h) $(ref_cache_poll_wrap_h) $(ref_cache_ref_files_h) $(ref_cache_sendfile_wrap_h) $(ref_cache_server_h) cram/pooled_alloc.h +ref_cache/upstream.o: ref_cache/upstream.c config.h $(ref_cache_upstream_h) $(ref_cache_cmsg_wrap_h) $(ref_cache_misc_h) $(ref_cache_options_h) $(ref_cache_poll_wrap_h) $(htslib_hts_defs_h) + # Runes to check that the htscodecs submodule is present ifdef HTSCODECS_SOURCES htscodecs/htscodecs/%.c: | 
htscodecs/htscodecs @@ -561,7 +627,7 @@ htscodecs/htscodecs: @false # Build the htscodecs/htscodecs/version.h file if necessary -htscodecs/htscodecs/version.h: force +htscodecs/htscodecs/version.h: force | htscodecs/htscodecs @if test -e $(srcdir)/htscodecs/.git && test -e $(srcdir)/htscodecs/configure.ac ; then \ vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ case "$$vers" in \ @@ -628,7 +694,12 @@ check test: all $(HTSCODECS_TEST_TARGETS) cd test/base_mods && ./base-mods.sh base-mods.tst REF_PATH=: test/sam test/ce.fa test/faidx/faidx.fa test/faidx/fastqs.fq test/test-regidx - cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} + cd test && \ + if test "x$(BUILT_PLUGINS)" != "x"; then \ + REF_PATH=: HTS_PATH=.. ./with-shlib.sh ./test.pl $(REF_CACHE_TEST_OPTS) $${TEST_OPTS:-} ; \ + else \ + REF_PATH=: ./test.pl $(REF_CACHE_TEST_OPTS) $${TEST_OPTS:-} ; \ + fi test/hts_endian: test/hts_endian.o $(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS) @@ -796,7 +867,7 @@ test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) -test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) +test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_hts_defs_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c @@ -874,19 +945,21 @@ $(srcprefix)htslib.map: libhts.so printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \ cat $@.tmp >> 
$@.new.tmp && \ printf '} %s;\n' "$$last_vers" >> $@.new.tmp && \ - rm -f $@.tmp && \ - mv $@.new.tmp $@ ; \ - fi ; \ + rm -f $@.tmp && \ + mv $@.new.tmp $@ ; \ else \ rm -f $@.tmp ; \ fi -install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig - $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) +install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(REF_CACHE_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig + $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(REF_CACHE_PROGRAMS) $(DESTDIR)$(bindir) if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi $(INSTALL_DATA) $(SRC)htslib/*.h $(DESTDIR)$(includedir)/htslib $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a $(INSTALL_MAN) $(SRC)annot-tsv.1 $(SRC)bgzip.1 $(SRC)htsfile.1 $(SRC)tabix.1 $(DESTDIR)$(man1dir) + if test "x$(REF_CACHE_PROGRAMS)" != "x" ; then \ + $(INSTALL_MAN) $(SRC)ref_cache/ref-cache.1 $(DESTDIR)$(man1dir) ; \ + fi $(INSTALL_MAN) $(SRC)faidx.5 $(SRC)sam.5 $(SRC)vcf.5 $(DESTDIR)$(man5dir) $(INSTALL_MAN) $(SRC)htslib-s3-plugin.7 $(DESTDIR)$(man7dir) @@ -925,12 +998,12 @@ install-pkgconfig: htslib.pc.tmp installdirs # A pkg-config file (suitable for copying to $PKG_CONFIG_PATH) that provides # flags for building against the uninstalled library in this build directory. 
htslib-uninstalled.pc: htslib.pc.tmp - sed -e 's#@-includedir@#'`pwd`'#g;s#@-libdir@#'`pwd`'#g' htslib.pc.tmp > $@ + sed -e "s#@-includedir@#`pwd`#g;s#@-libdir@#`pwd`#g" htslib.pc.tmp > $@ testclean: -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* \ - test/longrefs/*.tmp.* test/tabix/*.tmp.* \ + test/longrefs/*.tmp.* test/ref_cache/*.tmp.* test/tabix/*.tmp.* \ test/bgzf_boundaries/*.tmp.* test/*/FAIL* \ header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out @@ -943,9 +1016,10 @@ mostlyclean: testclean -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico $(DEL_HTSCODECS_VERSION) -rm -f hts-object-files -rm -f htscodecs/tests/*.o + -rm -f ref_cache/*.o clean: mostlyclean clean-$(SHLIB_FLAVOUR) - -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) + -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) $(REF_CACHE_PROGRAMS) -rm -f htscodecs/tests/rans4x8 htscodecs/tests/rans4x16pr htscodecs/tests/arith_dynamic htscodecs/tests/tokenise_name3 htscodecs/tests/fqzcomp_qual htscodecs/tests/varint distclean maintainer-clean: clean @@ -974,11 +1048,11 @@ tags TAGS: # code with your program, this hook enables Automake-style "make dist" # for this subdirectory. If you do bundle an htslib snapshot, please # add identifying information to $(PACKAGE_VERSION) as appropriate. -# (The wildcards attempt to omit non-exported files (.git*, README.md, +# (The wildcards attempt to omit non-exported files (.git, .gitignore, # etc) and other detritus that might be in the top-level directory.) 
-distdir: +distdir: htscodecs/htscodecs/version.h @if [ -z "$(distdir)" ]; then echo "Please supply a distdir=DIR argument."; false; fi - tar -c *.[ch15] [ILMNRchtv]*[ELSbcekmnth] | (cd $(distdir) && tar -x) + tar -c *.[ch157] [I-R]*[ELSde] [cmors]*[bcemns4] [bhtv]*[bhknpt] htscodecs/*.md htscodecs/[ht]*/*[chp-t0-9] | (cd $(distdir) && tar -x) +cd $(distdir) && $(MAKE) distclean force: diff --git a/htslib/bcf_sr_sort.c b/htslib/bcf_sr_sort.c index 01e98bb3..73be004c 100644 --- a/htslib/bcf_sr_sort.c +++ b/htslib/bcf_sr_sort.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2021,2024 Genome Research Ltd. Author: Petr Danecek @@ -32,6 +32,7 @@ #include "htslib/khash_str2int.h" #include "htslib/kbitset.h" +// Variant types and pair-wise compatibility of their combinations, see bcf_sr_init_scores() #define SR_REF 1 #define SR_SNP 2 #define SR_INDEL 4 @@ -366,7 +367,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, // group VCFs into groups, each with a unique combination of variants in the duplicate lines int ireader,ivar,irec,igrp,ivset,iact; for (ireader=0; ireadernreaders; ireader++) srt->vcf_buf[ireader].nrec = 0; - for (iact=0; iactnactive; iact++) + for (iact=0; iactnactive; iact++) // process each of the active readers, ie which still have a record to process { ireader = srt->active[iact]; bcf_sr_t *reader = &readers->readers[ireader]; @@ -384,6 +385,11 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, srt->off[srt->noff++] = srt->str.l; size_t beg = srt->str.l; int end_pos = -1; + if ( srt->pair & BCF_SR_PAIR_ID ) + { + kputs(line->d.id,&srt->str); + kputc(':',&srt->str); + } for (ivar=1; ivarn_allele; ivar++) { if ( ivar>1 ) kputc(',',&srt->str); @@ -417,7 +423,10 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } // Create new variant or attach to existing one. 
But careful, there can be duplicate - // records with the same POS,REF,ALT (e.g. in dbSNP-b142) + // records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, use a + // hash table (srt->var_str2int) and a counter (var_idx) to ensure they are + // treated as separate variants, while still allowing them to be matched + // between readers. char *var_str = beg + srt->str.s; int ret, var_idx = 0, var_end = srt->str.l; while ( 1 ) @@ -435,6 +444,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } if ( ret==-1 ) { + // the variant is not present, insert ivar = srt->nvar++; hts_expand0(var_t,srt->nvar,srt->mvar,srt->var); srt->var[ivar].nvcf = 0; diff --git a/htslib/bcf_sr_sort.h b/htslib/bcf_sr_sort.h index c8bd787a..447e8bf7 100644 --- a/htslib/bcf_sr_sort.h +++ b/htslib/bcf_sr_sort.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2019,2024 Genome Research Ltd. Author: Petr Danecek @@ -55,6 +55,9 @@ typedef struct } var_t; +// Group is a set of variants in duplicate records within one VCF. They are identified with a key (used only +// for debugging), such as C>A,C>G;C>T. Commas separate alleles in a multiallelic record, semicolons separate +// VCF lines. 
typedef struct { char *key; // only for debugging @@ -67,7 +70,7 @@ typedef struct { int nvar, mvar, *var; // list of compatible variants that can be output together int cnt; // number of readers in this group - kbitset_t *mask; // which groups are populated in this set (replace with expandable bitmask) + kbitset_t *mask; // which groups are populated in this set } varset_t; @@ -100,8 +103,13 @@ sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); void bcf_sr_sort_reset(sr_sort_t *srt); int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos); + +// initialize a new position using the i-th reader int bcf_sr_sort_set_active(sr_sort_t *srt, int i); + +// add i-th reader with the same position, assumed bcf_sr_sort_set_active() was called with another reader int bcf_sr_sort_add_active(sr_sort_t *srt, int i); + void bcf_sr_sort_destroy(sr_sort_t *srt); void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i); diff --git a/htslib/bgzf.c b/htslib/bgzf.c index 8092c7b9..8cf3b782 100644 --- a/htslib/bgzf.c +++ b/htslib/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2023 Genome Research Ltd + Copyright (C) 2009, 2013-2025 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -548,6 +548,10 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode) } #ifdef HAVE_LIBDEFLATE +uint32_t hts_crc32(uint32_t crc, const void *buf, size_t len) { + return libdeflate_crc32(crc, buf, len); +} + int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int level) { if (slen == 0) { @@ -607,6 +611,10 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le #else +uint32_t hts_crc32(uint32_t crc, const void *buf, size_t len) { + return crc32(crc, buf, len); +} + int 
bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int level) { uint32_t crc; @@ -1350,13 +1358,7 @@ static void *bgzf_encode_level0_func(void *arg) { u16_to_le(~j->uncomp_len, j->comp_data + BLOCK_HEADER_LENGTH + 3); // Trailer (CRC, uncompressed length) -#ifdef HAVE_LIBDEFLATE - crc = libdeflate_crc32(0, j->comp_data + BLOCK_HEADER_LENGTH + 5, - j->uncomp_len); -#else - crc = crc32(crc32(0L, NULL, 0L), - (Bytef*)j->comp_data + BLOCK_HEADER_LENGTH + 5, j->uncomp_len); -#endif + crc = hts_crc32(0, j->comp_data + BLOCK_HEADER_LENGTH + 5, j->uncomp_len); u32_to_le(crc, j->comp_data + j->comp_len - 8); u32_to_le(j->uncomp_len, j->comp_data + j->comp_len - 4); @@ -1584,7 +1586,7 @@ static void bgzf_mt_seek(BGZF *fp) { mt->errcode = 0; if (hseek(fp->fp, mt->block_address, SEEK_SET) < 0) - mt->errcode = BGZF_ERR_IO; + mt->errcode = errno; pthread_mutex_unlock(&mt->job_pool_m); mt->command = SEEK_DONE; @@ -1902,7 +1904,7 @@ static int mt_flush_queue(BGZF *fp) if ((shutdown = hts_tpool_process_is_shutdown(mt->out_queue))) break; pthread_mutex_unlock(&mt->job_pool_m); - usleep(10000); // FIXME: replace by condition variable + hts_usleep(10000); // FIXME: replace by condition variable pthread_mutex_lock(&mt->job_pool_m); } pthread_mutex_unlock(&mt->job_pool_m); @@ -2203,8 +2205,16 @@ static inline int64_t bgzf_seek_common(BGZF* fp, abort(); // Should not get to any other state } } while (fp->mt->command != SEEK_DONE); + fp->mt->command = NONE; + if (fp->mt->errcode) { + fp->errcode |= BGZF_ERR_IO; + errno = fp->mt->errcode; + pthread_mutex_unlock(&fp->mt->command_m); + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded fp->block_address = block_address; fp->block_offset = block_offset; diff --git a/htslib/config.h.in b/htslib/config.h.in index 70c6d341..32ef87ff 100644 --- a/htslib/config.h.in +++ b/htslib/config.h.in @@ -40,6 +40,22 @@ /* Define if you have the Common Crypto library. 
*/ #undef HAVE_COMMONCRYPTO +/* Define to 1 if you have the declaration of 'AI_ADDRCONFIG', and to 0 if you + don't. */ +#undef HAVE_DECL_AI_ADDRCONFIG + +/* Define to 1 if you have the declaration of 'AI_V4MAPPED', and to 0 if you + don't. */ +#undef HAVE_DECL_AI_V4MAPPED + +/* Define to 1 if you have the declaration of 'EHOSTDOWN', and to 0 if you + don't. */ +#undef HAVE_DECL_EHOSTDOWN + +/* Define to 1 if you have the declaration of 'ENONET', and to 0 if you don't. + */ +#undef HAVE_DECL_ENONET + /* Define to 1 if you have the declaration of '__cpuid_count', and to 0 if you don't. */ #undef HAVE_DECL___CPUID_COUNT @@ -51,15 +67,27 @@ /* Define to 1 if you have the 'drand48' function. */ #undef HAVE_DRAND48 +/* Define to 1 if you have the 'elf_aux_info' function. */ +#undef HAVE_ELF_AUX_INFO + +/* Define if epoll is available */ +#undef HAVE_EPOLL + /* Define if using an external libhtscodecs */ #undef HAVE_EXTERNAL_LIBHTSCODECS /* Define to 1 if you have the 'fdatasync' function. */ #undef HAVE_FDATASYNC +/* Define if you have FreeBSD-type sendfile */ +#undef HAVE_FREEBSD_SENDFILE + /* Define to 1 if you have the 'fsync' function. */ #undef HAVE_FSYNC +/* Define to 1 if you have the 'getauxval' function. */ +#undef HAVE_GETAUXVAL + /* Define to 1 if you have the 'getpagesize' function. */ #undef HAVE_GETPAGESIZE @@ -87,15 +115,24 @@ /* Define to 1 if you have the 'z' library (-lz). */ #undef HAVE_LIBZ +/* Define if you have Linux-type sendfile */ +#undef HAVE_LINUX_SENDFILE + /* Define to 1 if you have the header file. */ #undef HAVE_LZMA_H +/* Define if you have macOS-type sendfile */ +#undef HAVE_MACOS_SENDFILE + /* Define to 1 if you have a working 'mmap' system call. */ #undef HAVE_MMAP /* Defined to 1 if rANS source using popcnt can be compiled. */ #undef HAVE_POPCNT +/* Define to 1 if you have the 'posix_memalign' function. */ +#undef HAVE_POSIX_MEMALIGN + /* Define to 1 if you have the 'srand48_deterministic' function. 
*/ #undef HAVE_SRAND48_DETERMINISTIC @@ -153,6 +190,9 @@ /* Platform-dependent plugin filename extension. */ #undef PLUGIN_EXT +/* Define if epoll edge triggering is available */ +#undef PW_HAVE_EDGE + /* Define to 1 if all of the C89 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ diff --git a/htslib/config.mk.in b/htslib/config.mk.in index 59a121cf..98acb01f 100644 --- a/htslib/config.mk.in +++ b/htslib/config.mk.in @@ -118,3 +118,9 @@ endif HTS_CFLAGS_AVX2 = @hts_cflags_avx2@ HTS_CFLAGS_AVX512 = @hts_cflags_avx512@ HTS_CFLAGS_SSE4 = @hts_cflags_sse4@ + +# Optional ref-cache program +REF_CACHE_PROGRAMS = @hts_ref_cache_programs@ +REF_CACHE_EXTRA_C_FLAGS = @hts_paranoia_c_flags@ +REF_CACHE_EXTRA_LD_FLAGS = @hts_paranoia_ld_flags@ +REF_CACHE_TEST_OPTS = @hts_ref_cache_test_opts@ diff --git a/htslib/configure b/htslib/configure index 1a6ec7d3..87f01330 100755 --- a/htslib/configure +++ b/htslib/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for HTSlib 1.21. +# Generated by GNU Autoconf 2.72 for HTSlib 1.22. # # Report bugs to . # @@ -609,8 +609,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='HTSlib' PACKAGE_TARNAME='htslib' -PACKAGE_VERSION='1.21' -PACKAGE_STRING='HTSlib 1.21' +PACKAGE_VERSION='1.22' +PACKAGE_STRING='HTSlib 1.22' PACKAGE_BUGREPORT='samtools-help@lists.sourceforge.net' PACKAGE_URL='http://www.htslib.org/' @@ -658,6 +658,10 @@ private_LIBS pc_requires CRYPTO_LIBS s3 +hts_ref_cache_test_opts +hts_paranoia_ld_flags +hts_paranoia_c_flags +hts_ref_cache_programs gcs libcurl PLUGIN_EXT @@ -745,6 +749,7 @@ with_external_htscodecs with_libdeflate with_plugin_dir with_plugin_path +enable_ref_cache enable_s3 enable_year2038 ' @@ -1307,7 +1312,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -'configure' configures HTSlib 1.21 to adapt to many kinds of systems. +'configure' configures HTSlib 1.22 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1373,7 +1378,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of HTSlib 1.21:";; + short | recursive ) echo "Configuration of HTSlib 1.22:";; esac cat <<\_ACEOF @@ -1391,6 +1396,7 @@ Optional Features: --enable-libcurl enable libcurl-based support for http/https/etc URLs --disable-lzma omit support for LZMA-compressed CRAM files --enable-plugins enable separately-compiled plugins for file access + --disable-ref-cache build CRAM reference caching proxy --enable-s3 support Amazon AWS S3 URLs --enable-year2038 support timestamps after 2038 @@ -1485,7 +1491,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -HTSlib configure 1.21 +HTSlib configure 1.22 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -1812,7 +1818,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by HTSlib $as_me 1.21, which was +It was created by HTSlib $as_me 1.22, which was generated by GNU Autoconf 2.72. Invocation command line was $ $0$ac_configure_args_raw @@ -2655,6 +2661,31 @@ ac_config_headers="$ac_config_headers config.h" +# SYNOPSIS +# +# HTS_TEST_CC_FLAG(FLAG, FOUND_VAR, REQUIRE_SILENCE) +# +# Test if FLAG can be used on CFLAGS. It it works, +# variable FOUND_VAR is set to FLAG. If REQUIRE_SILENCE is "yes", +# only pass if the compilation did not produce any diagnostics (needed +# to deal with compilers that accept unknown options, generate +# warnings about them but don't exit non-zero, thus breaking the test). + + + +# SYNOPSIS +# +# HTS_TEST_CC_LD_FLAG(FLAG, FOUND_VAR, REQUIRE_SILENCE) +# +# Test if FLAG can be used on LDFLAGS. It it works, +# variable FOUND_VAR is set to FLAG. If REQUIRE_SILENCE is "yes", +# only pass if the compilation did not produce any diagnostics (needed +# to deal with compilers that accept unknown options, generate +# warnings about them but don't exit non-zero, thus breaking the test). + + + + # pkg.m4 - Macros to locate and use pkg-config. -*- Autoconf -*- @@ -4154,10 +4185,11 @@ fi # HTSlib uses X/Open-only facilities (M_SQRT2 etc, drand48() etc), and # various POSIX functions that are provided by various _POSIX_C_SOURCE values -# or by _XOPEN_SOURCE >= 500. It also uses usleep(), which is removed when -# _XOPEN_SOURCE >= 700. Additionally, some definitions may require +# or by _XOPEN_SOURCE >= 500. Additionally, some definitions may require # _XOPEN_SOURCE >= 600 on some platforms (snprintf on MinGW, -# PTHREAD_MUTEX_RECURSIVE on some Linux distributions). Hence we set it to 600. +# PTHREAD_MUTEX_RECURSIVE on some Linux distributions), and ref-cache uses +# openat(), mkdirat() etc. which need _XOPEN_SOURCE >= 700. +# Hence we set it to 700. # Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc. 
@@ -4279,7 +4311,7 @@ then : else case e in #( e) -printf "%s\n" "#define _XOPEN_SOURCE 600" >>confdefs.h +printf "%s\n" "#define _XOPEN_SOURCE 700" >>confdefs.h ;; esac fi @@ -5172,6 +5204,16 @@ fi pluginpath=$with_plugin_path +# Check whether --enable-ref-cache was given. +if test ${enable_ref_cache+y} +then : + enableval=$enable_ref_cache; +else case e in #( + e) enable_ref_cache=check ;; +esac +fi + + # Check whether --enable-s3 was given. if test ${enable_s3+y} then : @@ -5656,6 +5698,24 @@ then : printf "%s\n" "#define HAVE_SRAND48_DETERMINISTIC 1" >>confdefs.h fi +ac_fn_c_check_func "$LINENO" "getauxval" "ac_cv_func_getauxval" +if test "x$ac_cv_func_getauxval" = xyes +then : + printf "%s\n" "#define HAVE_GETAUXVAL 1" >>confdefs.h + +fi +ac_fn_c_check_func "$LINENO" "elf_aux_info" "ac_cv_func_elf_aux_info" +if test "x$ac_cv_func_elf_aux_info" = xyes +then : + printf "%s\n" "#define HAVE_ELF_AUX_INFO 1" >>confdefs.h + +fi +ac_fn_c_check_func "$LINENO" "posix_memalign" "ac_cv_func_posix_memalign" +if test "x$ac_cv_func_posix_memalign" = xyes +then : + printf "%s\n" "#define HAVE_POSIX_MEMALIGN 1" >>confdefs.h + +fi # Darwin has a dubious fdatasync() symbol, but no declaration in @@ -6870,6 +6930,1308 @@ fi fi fi +ref_cache=disabled +if test "x$enable_ref_cache" != xno +then : + case $PLATFORM in #( + Darwin | default) : + + if test "x$libcurl" = xenabled +then : + ref_cache="enabled" +else case e in #( + e) + if test "x$enable_ref_cache" = xcheck +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ref-cache not enabled: requires libcurl" >&5 +printf "%s\n" "$as_me: WARNING: ref-cache not enabled: requires libcurl" >&2;} + +else case e in #( + e) + cat > config.mk <<'EOF' +ifneq ($(MAKECMDGOALS),distclean) +$(error Resolve configure error first) +endif +EOF + as_fn_error $? "ref-cache not enabled + +The ref-cache program requires libcurl support to be enabled in HTSlib. +Configure with --enable-libcurl in order to build ref-cache."
"$LINENO" 5 + ;; +esac +fi + ;; +esac +fi + ;; #( + *) : + if test "x$enable_ref_cache" = xcheck +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ref-cache not enabled: unsupported platform" >&5 +printf "%s\n" "$as_me: WARNING: ref-cache not enabled: unsupported platform" >&2;} + +else case e in #( + e) + cat > config.mk <<'EOF' +ifneq ($(MAKECMDGOALS),distclean) +$(error Resolve configure error first) +endif +EOF + as_fn_error $? "ref-cache not enabled + +ref-cache is not supported on this platform. Supported platforms include +Linux, FreeBSD, MacOS, and other BSD derivatives" "$LINENO" 5 + ;; +esac +fi + ;; +esac + +fi + +# Check how to get a working cmsg interface +if test "$ref_cache" = enabled +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for CMSG_LEN" >&5 +printf %s "checking for CMSG_LEN... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + #if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#include +#include +#include + +int +main (void) +{ + +struct msghdr msg; +char buf[CMSG_SPACE(sizeof(int))]; +struct cmsghdr *cmsg; +unsigned char *fdptr; +msg.msg_control = buf; +msg.msg_controllen = sizeof(buf); +cmsg = CMSG_FIRSTHDR(&msg); +cmsg->cmsg_level = SOL_SOCKET; +cmsg->cmsg_type = SCM_RIGHTS; +cmsg->cmsg_len = CMSG_LEN(sizeof(int)); +fdptr = CMSG_DATA(cmsg); +return fdptr != NULL && cmsg->cmsg_len > 0 && msg.msg_control != NULL; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } + +else case e in #( + e) + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + ref_cache="disabled" + if test "x$enable_ref_cache" = xcheck +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ref-cache not enabled: missing CMSG_LEN()" >&5 +printf "%s\n" "$as_me: WARNING: 
ref-cache not enabled: missing CMSG_LEN()" >&2;} + +else case e in #( + e) + cat > config.mk <<'EOF' +ifneq ($(MAKECMDGOALS),distclean) +$(error Resolve configure error first) +endif +EOF + as_fn_error $? "ref-cache not enabled + +ref-cache is not supported on this configuration, as the CMSG_LEN() interface +cannot be found." "$LINENO" 5 + ;; +esac +fi + ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + +fi + +hts_paranoia_c_flags="" +hts_paranoia_ld_flags="" +if test "$ref_cache" = enabled +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for epoll" >&5 +printf %s "checking for epoll... " >&6; } + hts_have_epoll=0 + hts_have_edge_trigger=0 + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main (void) +{ + +struct epoll_event ev; +int fd = epoll_create(0); +ev.events = EPOLLIN | EPOLLET; +ev.data.fd = 0; +if (epoll_ctl(fd, EPOLL_CTL_ADD, 0, &ev) == -1) { + return 1; +} +return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + + hts_have_epoll=1 + hts_have_edge_trigger=1 + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes (edge triggered)" >&5 +printf "%s\n" "yes (edge triggered)" >&6; } + +else case e in #( + e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main (void) +{ + +struct epoll_event ev; +int fd = epoll_create(0); +ev.events = EPOLLIN; +ev.data.fd = 0; +if (epoll_ctl(fd, EPOLL_CTL_ADD, 0, &ev) == -1) { + return 1; +} +return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + + hts_have_epoll=1 + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } + +else case e in #( + e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + +printf "%s\n" "#define HAVE_EPOLL $hts_have_epoll" >>confdefs.h + + +printf "%s\n" "#define PW_HAVE_EDGE $hts_have_edge_trigger" >>confdefs.h + + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for sendfile" >&5 +printf %s "checking for sendfile... " >&6; } + hts_have_sendfile=no + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main (void) +{ + +#if defined(__linux__) +off_t offset = 0; +return sendfile(1, 0, &offset, 1000) < 0; +#else +deliberately fail +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes, Linux-style" >&5 +printf "%s\n" "yes, Linux-style" >&6; } + +printf "%s\n" "#define HAVE_LINUX_SENDFILE 1" >>confdefs.h + + hts_have_sendfile=linux + +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + if test $hts_have_sendfile = no +then : + + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#include +#include +#include +#include +int +main (void) +{ + +#if (defined(__FreeBSD__) && __FreeBSD__ >= 4) || defined(__DragonFly__) +off_t sbytes = 0, offset = 0; +struct sf_hdtr hdtr = { NULL, 0, NULL, 0 }; +return sendfile(0, 1, offset, 1000, &hdtr, &sbytes, 0) < 0; +#else +deliberately fail +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes, FreeBSD-style" >&5 +printf "%s\n" "yes, FreeBSD-style" >&6; } + +printf "%s\n" "#define HAVE_FREEBSD_SENDFILE 1" >>confdefs.h + + hts_have_sendfile=freebsd + +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + +fi + if test $hts_have_sendfile = no +then : + + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#if !defined(_DARWIN_C_SOURCE) +#define _DARWIN_C_SOURCE +#endif +#include +#include +#include +#include +int +main (void) +{ + +#if defined(__APPLE__) && defined(__MACH__) +off_t len = 1000, offset = 0; +struct sf_hdtr hdtr = { NULL, 0, NULL, 0 }; +return sendfile(0, 1, offset, &len, &hdtr, 0) < 0; +#else +deliberately fail +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes, macOS-style" >&5 +printf "%s\n" "yes, macOS-style" >&6; } + +printf "%s\n" "#define HAVE_MACOS_SENDFILE 1" >>confdefs.h + + hts_have_sendfile=macos +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + +fi + if test $hts_have_sendfile = no +then : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + +fi + + ac_fn_check_decl "$LINENO" "EHOSTDOWN" 
"ac_cv_have_decl_EHOSTDOWN" "#include +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_EHOSTDOWN" = xyes +then : + ac_have_decl=1 +else case e in #( + e) ac_have_decl=0 ;; +esac +fi +printf "%s\n" "#define HAVE_DECL_EHOSTDOWN $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "ENONET" "ac_cv_have_decl_ENONET" "#include +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_ENONET" = xyes +then : + ac_have_decl=1 +else case e in #( + e) ac_have_decl=0 ;; +esac +fi +printf "%s\n" "#define HAVE_DECL_ENONET $ac_have_decl" >>confdefs.h + + ac_fn_check_decl "$LINENO" "AI_V4MAPPED" "ac_cv_have_decl_AI_V4MAPPED" "#include +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_AI_V4MAPPED" = xyes +then : + ac_have_decl=1 +else case e in #( + e) ac_have_decl=0 ;; +esac +fi +printf "%s\n" "#define HAVE_DECL_AI_V4MAPPED $ac_have_decl" >>confdefs.h +ac_fn_check_decl "$LINENO" "AI_ADDRCONFIG" "ac_cv_have_decl_AI_ADDRCONFIG" "#include +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_AI_ADDRCONFIG" = xyes +then : + ac_have_decl=1 +else case e in #( + e) ac_have_decl=0 ;; +esac +fi +printf "%s\n" "#define HAVE_DECL_AI_ADDRCONFIG $ac_have_decl" >>confdefs.h + + + # Test various compiler options, recommended by + # https://github.com/ossf/wg-best-practices-os-developers/blob/main/docs/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C%2B%2B.md + if test "x$GCC" = "xyes" +then : + + tmp_cflags="$CFLAGS" + CFLAGS="-Werror -Wthis-is-really-not-a-vaild-option" + + tmp_desire_silence=no + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + tmp_desire_silence=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wextra" >&5 +printf %s "checking whether the compiler accepts -Wextra... " >&6; } +if test ${hts_cv_check__Wextra+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wextra" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wextra=no +else case e in #( + e) hts_cv_check__Wextra=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wextra" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wextra=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wextra" >&5 +printf "%s\n" "$hts_cv_check__Wextra" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wformat" >&5 +printf %s "checking whether the compiler accepts -Wformat... " >&6; } +if test ${hts_cv_check__Wformat+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wformat" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wformat=no +else case e in #( + e) hts_cv_check__Wformat=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wformat" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wformat=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wformat" >&5 +printf "%s\n" "$hts_cv_check__Wformat" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wformat=2" >&5 +printf %s "checking whether the compiler accepts -Wformat=2... " >&6; } +if test ${hts_cv_check__Wformat_2+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wformat=2" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wformat_2=no +else case e in #( + e) hts_cv_check__Wformat_2=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wformat=2" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wformat_2=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wformat_2" >&5 +printf "%s\n" "$hts_cv_check__Wformat_2" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wconversion" >&5 +printf %s "checking whether the compiler accepts -Wconversion... " >&6; } +if test ${hts_cv_check__Wconversion+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wconversion" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wconversion=no +else case e in #( + e) hts_cv_check__Wconversion=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wconversion" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wconversion=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wconversion" >&5 +printf "%s\n" "$hts_cv_check__Wconversion" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wtrampolines" >&5 +printf %s "checking whether the compiler accepts -Wtrampolines... " >&6; } +if test ${hts_cv_check__Wtrampolines+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wtrampolines" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wtrampolines=no +else case e in #( + e) hts_cv_check__Wtrampolines=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wtrampolines" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wtrampolines=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wtrampolines" >&5 +printf "%s\n" "$hts_cv_check__Wtrampolines" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wstrict-aliasing" >&5 +printf %s "checking whether the compiler accepts -Wstrict-aliasing... " >&6; } +if test ${hts_cv_check__Wstrict_aliasing+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -Wstrict-aliasing" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wstrict_aliasing=no +else case e in #( + e) hts_cv_check__Wstrict_aliasing=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wstrict-aliasing" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wstrict_aliasing=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wstrict_aliasing" >&5 +printf "%s\n" "$hts_cv_check__Wstrict_aliasing" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fstack-clash-protection" >&5 +printf %s "checking whether the compiler accepts -fstack-clash-protection... " >&6; } +if test ${hts_cv_check__fstack_clash_protection+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -fstack-clash-protection" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__fstack_clash_protection=no +else case e in #( + e) hts_cv_check__fstack_clash_protection=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-fstack-clash-protection" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__fstack_clash_protection=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fstack_clash_protection" >&5 +printf "%s\n" "$hts_cv_check__fstack_clash_protection" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fstack-protector-strong" >&5 +printf %s "checking whether the compiler accepts -fstack-protector-strong... " >&6; } +if test ${hts_cv_check__fstack_protector_strong+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -fstack-protector-strong" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__fstack_protector_strong=no +else case e in #( + e) hts_cv_check__fstack_protector_strong=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-fstack-protector-strong" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__fstack_protector_strong=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fstack_protector_strong" >&5 +printf "%s\n" "$hts_cv_check__fstack_protector_strong" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fcf-protection=full" >&5 +printf %s "checking whether the compiler accepts -fcf-protection=full... " >&6; } +if test ${hts_cv_check__fcf_protection_full+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -fcf-protection=full" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__fcf_protection_full=no +else case e in #( + e) hts_cv_check__fcf_protection_full=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-fcf-protection=full" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__fcf_protection_full=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fcf_protection_full" >&5 +printf "%s\n" "$hts_cv_check__fcf_protection_full" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -mbranch-protection=standard" >&5 +printf %s "checking whether the compiler accepts -mbranch-protection=standard... " >&6; } +if test ${hts_cv_check__mbranch_protection_standard+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -mbranch-protection=standard" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__mbranch_protection_standard=no +else case e in #( + e) hts_cv_check__mbranch_protection_standard=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-mbranch-protection=standard" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__mbranch_protection_standard=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__mbranch_protection_standard" >&5 +printf "%s\n" "$hts_cv_check__mbranch_protection_standard" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + tmp_flag="" + CFLAGS="-Werror" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fno-delete-null-pointer-checks" >&5 +printf %s "checking whether the compiler accepts -fno-delete-null-pointer-checks... " >&6; } +if test ${hts_cv_check__fno_delete_null_pointer_checks+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__fno_delete_null_pointer_checks=no +else case e in #( + e) hts_cv_check__fno_delete_null_pointer_checks=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-fno-delete-null-pointer-checks" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__fno_delete_null_pointer_checks=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ac_check_save_cflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fno_delete_null_pointer_checks" >&5 +printf "%s\n" "$hts_cv_check__fno_delete_null_pointer_checks" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag" +fi + + CFLAGS="$tmp_cflags" + + + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,-z,noexecstack" >&5 +printf %s "checking whether the compiler accepts -Wl,-z,noexecstack... " >&6; } +if test ${hts_cv_check__Wl__z_noexecstack+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,-z,noexecstack" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl__z_noexecstack=no +else case e in #( + e) hts_cv_check__Wl__z_noexecstack=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,-z,noexecstack" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl__z_noexecstack=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl__z_noexecstack" >&5 +printf "%s\n" "$hts_cv_check__Wl__z_noexecstack" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,-z,relro" >&5 +printf %s "checking whether the compiler accepts -Wl,-z,relro... " >&6; } +if test ${hts_cv_check__Wl__z_relro+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,-z,relro" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h.
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl__z_relro=no +else case e in #( + e) hts_cv_check__Wl__z_relro=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,-z,relro" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl__z_relro=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl__z_relro" >&5 +printf "%s\n" "$hts_cv_check__Wl__z_relro" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,-z,now" >&5 +printf %s "checking whether the compiler accepts -Wl,-z,now... " >&6; } +if test ${hts_cv_check__Wl__z_now+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,-z,now" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h.
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl__z_now=no +else case e in #( + e) hts_cv_check__Wl__z_now=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,-z,now" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl__z_now=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl__z_now" >&5 +printf "%s\n" "$hts_cv_check__Wl__z_now" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,--as-needed" >&5 +printf %s "checking whether the compiler accepts -Wl,--as-needed... " >&6; } +if test ${hts_cv_check__Wl___as_needed+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,--as-needed" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h.
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl___as_needed=no +else case e in #( + e) hts_cv_check__Wl___as_needed=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,--as-needed" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl___as_needed=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl___as_needed" >&5 +printf "%s\n" "$hts_cv_check__Wl___as_needed" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,--no-copy-dt-needed-entries" >&5 +printf %s "checking whether the compiler accepts -Wl,--no-copy-dt-needed-entries... " >&6; } +if test ${hts_cv_check__Wl___no_copy_dt_needed_entries+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,--no-copy-dt-needed-entries" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h.
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl___no_copy_dt_needed_entries=no +else case e in #( + e) hts_cv_check__Wl___no_copy_dt_needed_entries=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,--no-copy-dt-needed-entries" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl___no_copy_dt_needed_entries=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl___no_copy_dt_needed_entries" >&5 +printf "%s\n" "$hts_cv_check__Wl___no_copy_dt_needed_entries" >&6; } + + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + + case $basic_host in #( + *-OpenBSD*) : + : ;; #( + *) : + tmp_flag="" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -Wl,-z,nodlopen" >&5 +printf %s "checking whether the compiler accepts -Wl,-z,nodlopen... " >&6; } +if test ${hts_cv_check__Wl__z_nodlopen+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS -Wl,-z,nodlopen" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h.
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + if test "x$tmp_desire_silence" = "xyes" && test -s conftest.err +then : + hts_cv_check__Wl__z_nodlopen=no +else case e in #( + e) hts_cv_check__Wl__z_nodlopen=yes + if test "xtmp_flag" != x +then : + eval tmp_flag="-Wl,-z,nodlopen" +fi ;; +esac +fi +else case e in #( + e) hts_cv_check__Wl__z_nodlopen=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$ac_check_save_ldflags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__Wl__z_nodlopen" >&5 +printf "%s\n" "$hts_cv_check__Wl__z_nodlopen" >&6; } + ;; +esac + if test "x$tmp_flag" != x +then : + hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag" +fi + +fi + + hts_ref_cache_programs="ref_cache/ref-cache" + hts_ref_cache_test_opts="--ref-cache-bin ../ref_cache/ref-cache" + +else case e in #( + e) + hts_ref_cache_programs="" + hts_ref_cache_test_opts="" + ;; +esac +fi + + + + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing regcomp" >&5 printf %s "checking for library containing regcomp... " >&6; } if test ${ac_cv_search_regcomp+y} @@ -7489,7 +8851,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by HTSlib $as_me 1.21, which was +This file was extended by HTSlib $as_me 1.22, which was generated by GNU Autoconf 2.72.
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7562,7 +8924,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -HTSlib config.status 1.21 +HTSlib config.status 1.22 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" @@ -8276,6 +9638,7 @@ printf "%s\n" "$as_me: executing $ac_file commands" >&6;} "mkdir":C) as_dir=cram; as_fn_mkdir_p as_dir=htscodecs/htscodecs; as_fn_mkdir_p as_dir=htscodecs/tests; as_fn_mkdir_p + as_dir=ref_cache; as_fn_mkdir_p as_dir=test/fuzz; as_fn_mkdir_p as_dir=test/longrefs; as_fn_mkdir_p as_dir=test/tabix; as_fn_mkdir_p ;; diff --git a/htslib/configure.ac b/htslib/configure.ac index cdb8391f..c7716ca7 100644 --- a/htslib/configure.ac +++ b/htslib/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2024 Genome Research Ltd. +# Copyright (C) 2015-2025 Genome Research Ltd. # # Author: John Marshall # @@ -72,14 +72,15 @@ HTS_PROG_CC_WERROR(hts_late_cflags) # HTSlib uses X/Open-only facilities (M_SQRT2 etc, drand48() etc), and # various POSIX functions that are provided by various _POSIX_C_SOURCE values -# or by _XOPEN_SOURCE >= 500. It also uses usleep(), which is removed when -# _XOPEN_SOURCE >= 700. Additionally, some definitions may require +# or by _XOPEN_SOURCE >= 500. Additionally, some definitions may require # _XOPEN_SOURCE >= 600 on some platforms (snprintf on MinGW, -# PTHREAD_MUTEX_RECURSIVE on some Linux distributions). Hence we set it to 600. +# PTHREAD_MUTEX_RECURSIVE on some Linux distributions), and ref-cache uses +# openat(), mkdirat() etc. which need _XOPEN_SOURCE >= 700. +# Hence we set it to 700. # Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc. 
AC_CHECK_DECL([_XOPEN_SOURCE], [], - [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])], + [AC_DEFINE([_XOPEN_SOURCE], [700], [Specify X/Open requirements])], []) dnl Check that we have cpuid, and if so run the x86 SIMD checks @@ -273,6 +274,11 @@ AC_ARG_WITH([plugin-path], [with_plugin_path=$with_plugin_dir]) AC_SUBST([pluginpath], $with_plugin_path) +AC_ARG_ENABLE([ref-cache], + [AS_HELP_STRING([--disable-ref-cache], + [build CRAM reference caching proxy])], + [], [enable_ref_cache=check]) + AC_ARG_ENABLE([s3], [AS_HELP_STRING([--enable-s3], [support Amazon AWS S3 URLs])], @@ -335,7 +341,7 @@ HTS_HIDE_DYNAMIC_SYMBOLS dnl FIXME This pulls in dozens of standard header checks AC_FUNC_MMAP -AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic]) +AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic getauxval elf_aux_info posix_memalign]) # Darwin has a dubious fdatasync() symbol, but no declaration in AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)]) @@ -634,6 +640,250 @@ dnl Only need to add to static_LIBS if not building as a plugin fi fi +ref_cache=disabled +AS_IF([test "x$enable_ref_cache" != xno], + [AS_CASE([$PLATFORM], + [Darwin | default],[ + AS_IF([test "x$libcurl" = xenabled], [ref_cache="enabled"], [ + AS_IF([test "x$enable_ref_cache" = check], [ + AC_MSG_WARN([ref-cache not enabled: requires libcurl]) + ],[ + MSG_ERROR([ref-cache not enabled + +The ref-cache program requires libcurl support to be enabled in HTSlib. +Configure with --enable-libcurl in order to build ref-cache.]) + ]) + ]) + ], + [AS_IF([test "x$enable_ref_cache" = xcheck], [ + AC_MSG_WARN([ref-cache not enabled: unsupported platform]) + ],[ + MSG_ERROR([ref-cache not enabled + +ref-cache is not supported on this platform. 
Supported platforms include +Linux, FreeBSD, MacOS, and other BSD derivatives]) + ]) + ]) + ]) + +# Check how to get a working cmsg interface +AS_IF([test "$ref_cache" = enabled],[ + AC_MSG_CHECKING([for CMSG_LEN]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#include +#include +#include +],[[ +struct msghdr msg; +char buf[CMSG_SPACE(sizeof(int))]; +struct cmsghdr *cmsg; +unsigned char *fdptr; +msg.msg_control = buf; +msg.msg_controllen = sizeof(buf); +cmsg = CMSG_FIRSTHDR(&msg); +cmsg->cmsg_level = SOL_SOCKET; +cmsg->cmsg_type = SCM_RIGHTS; +cmsg->cmsg_len = CMSG_LEN(sizeof(int)); +fdptr = CMSG_DATA(cmsg); +return fdptr != NULL && cmsg->cmsg_len > 0 && msg.msg_control != NULL; + ]])], [ + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ref_cache="disabled" + AS_IF([test "x$enable_ref_cache" = xcheck], [ + AC_MSG_WARN([ref-cache not enabled: missing CMSG_LEN()]) + ],[ + MSG_ERROR([ref-cache not enabled + +ref-cache is not supported on this configuration, as the CMSG_LEN() interface +cannot be found.]) + ]) + ]) +]) + +hts_paranoia_c_flags="" +hts_paranoia_ld_flags="" +AS_IF([test "$ref_cache" = enabled],[ + AC_MSG_CHECKING([for epoll]) + hts_have_epoll=0 + hts_have_edge_trigger=0 + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ],[ +struct epoll_event ev; +int fd = epoll_create(0); +ev.events = EPOLLIN | EPOLLET; +ev.data.fd = 0; +if (epoll_ctl(fd, EPOLL_CTL_ADD, 0, &ev) == -1) { + return 1; +} +return 0;])], + [ + hts_have_epoll=1 + hts_have_edge_trigger=1 + AC_MSG_RESULT([yes (edge triggered)]) + ], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ],[ +struct epoll_event ev; +int fd = epoll_create(0); +ev.events = EPOLLIN; +ev.data.fd = 0; +if (epoll_ctl(fd, EPOLL_CTL_ADD, 0, &ev) == -1) { + return 1; +} +return 0;])], + [ + hts_have_epoll=1 + AC_MSG_RESULT([yes]) + ], + [AC_MSG_RESULT([no])]) + ]) + AC_DEFINE_UNQUOTED([HAVE_EPOLL], [$hts_have_epoll], 
+ [Define if epoll is available]) + AC_DEFINE_UNQUOTED([PW_HAVE_EDGE], [$hts_have_edge_trigger], + [Define if epoll edge triggering is available]) + + AC_MSG_CHECKING([for sendfile]) + hts_have_sendfile=no + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ],[ +#if defined(__linux__) +off_t offset = 0; +return sendfile(1, 0, &offset, 1000) < 0; +#else +deliberately fail +#endif + ])], [ + AC_MSG_RESULT([yes, Linux-style]) + AC_DEFINE([HAVE_LINUX_SENDFILE], 1, + [Define if you have Linux-type sendfile]) + hts_have_sendfile=linux + ]) + AS_IF([test $hts_have_sendfile = no], [ + AC_LINK_IFELSE([AC_LANG_PROGRAM([#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#include +#include +#include +#include ],[ +#if (defined(__FreeBSD__) && __FreeBSD__ >= 4) || defined(__DragonFly__) +off_t sbytes = 0, offset = 0; +struct sf_hdtr hdtr = { NULL, 0, NULL, 0 }; +return sendfile(0, 1, offset, 1000, &hdtr, &sbytes, 0) < 0; +#else +deliberately fail +#endif + ])],[ + AC_MSG_RESULT([yes, FreeBSD-style]) + AC_DEFINE([HAVE_FREEBSD_SENDFILE], 1, + [Define if you have FreeBSD-type sendfile]) + hts_have_sendfile=freebsd + ]) + ]) + AS_IF([test $hts_have_sendfile = no], [ + AC_LINK_IFELSE([AC_LANG_PROGRAM([#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif +#if defined(_POSIX_C_SOURCE) +# undef _POSIX_C_SOURCE +#endif +#if !defined(_DARWIN_C_SOURCE) +#define _DARWIN_C_SOURCE +#endif +#include +#include +#include +#include ],[ +#if defined(__APPLE__) && defined(__MACH__) +off_t len = 1000, offset = 0; +struct sf_hdtr hdtr = { NULL, 0, NULL, 0 }; +return sendfile(0, 1, offset, &len, &hdtr, 0) < 0; +#else +deliberately fail +#endif + ])], + [ + AC_MSG_RESULT([yes, macOS-style]) + AC_DEFINE([HAVE_MACOS_SENDFILE], 1, + [Define if you have macOS-type sendfile]) + hts_have_sendfile=macos]) + ]) + AS_IF([test $hts_have_sendfile = no], [ + AC_MSG_RESULT([no]) + ]) + + AC_CHECK_DECLS([EHOSTDOWN, ENONET], [], [], [[#include ]]) + 
AC_CHECK_DECLS([AI_V4MAPPED, AI_ADDRCONFIG], [], [], [[#include ]]) + + # Test various compiler options, recommended by + # https://github.com/ossf/wg-best-practices-os-developers/blob/main/docs/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C%2B%2B.md + AS_IF([test "x$GCC" = "xyes"], + [dnl + dnl Annoyingly, some compilers (e.g. clang) only issue warnings + dnl for unrecognised warning options instead of failing. And even + dnl worse, some (e.g. icc) don't fail even if you use -Werror. + dnl Test for the latter case, and if the compiler behaves that way + dnl be more strict by failing if it doesn't run silently. + + tmp_cflags="$CFLAGS" + CFLAGS="-Werror -Wthis-is-really-not-a-vaild-option" + + tmp_desire_silence=no + AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], [tmp_desire_silence=yes]) + + m4_foreach([cc_opt], [[-Wextra], [-Wformat], [-Wformat=2], [-Wconversion], + [-Wtrampolines], [-Wstrict-aliasing], + [-fstack-clash-protection], + [-fstack-protector-strong], + [-fcf-protection=full], + [-mbranch-protection=standard], + [-fno-delete-null-pointer-checks]],[ + tmp_flag="" + CFLAGS="-Werror" + HTS_TEST_CC_FLAG(cc_opt, [tmp_flag], [$tmp_desire_silence]) + AS_IF([test "x$tmp_flag" != x], + [hts_paranoia_c_flags="$hts_paranoia_c_flags $tmp_flag"]) + ]) + CFLAGS="$tmp_cflags" + + dnl Double quoting needed here, due to commas in the tested options + m4_foreach([ld_opt], [[[-Wl,-z,noexecstack]], + [[-Wl,-z,relro]], [[-Wl,-z,now]], [[-Wl,--as-needed]], + [[-Wl,--no-copy-dt-needed-entries]]],[ + tmp_flag="" + HTS_TEST_CC_LD_FLAG(ld_opt, [tmp_flag], tmp_desire_silence) + AS_IF([test "x$tmp_flag" != x], + [hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag"]) + ]) + dnl Skip this test on OpenBSD as the option leads to non-functional + dnl binaries + AS_CASE([$basic_host], + [*-OpenBSD*], [:], + [tmp_flag="" + HTS_TEST_CC_LD_FLAG([[-Wl,-z,nodlopen]], [tmp_flag], tmp_desire_silence)]) + AS_IF([test "x$tmp_flag" != x], + 
[hts_paranoia_ld_flags="$hts_paranoia_ld_flags $tmp_flag"]) + ]) + + hts_ref_cache_programs="ref_cache/ref-cache" + hts_ref_cache_test_opts="--ref-cache-bin ../ref_cache/ref-cache" +],[ + hts_ref_cache_programs="" + hts_ref_cache_test_opts="" +]) +AC_SUBST([hts_ref_cache_programs]) +AC_SUBST([hts_paranoia_c_flags]) +AC_SUBST([hts_paranoia_ld_flags]) +AC_SUBST([hts_ref_cache_test_opts]) + dnl Look for regcomp in various libraries (needed on windows/mingw). AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) @@ -669,6 +919,7 @@ if test "$srcdir" != .; then [AS_MKDIR_P([cram]) AS_MKDIR_P([htscodecs/htscodecs]) AS_MKDIR_P([htscodecs/tests]) + AS_MKDIR_P([ref_cache]) AS_MKDIR_P([test/fuzz]) AS_MKDIR_P([test/longrefs]) AS_MKDIR_P([test/tabix])]) diff --git a/htslib/cram/cram_codecs.c b/htslib/cram/cram_codecs.c index a72419e1..fe454f79 100644 --- a/htslib/cram/cram_codecs.c +++ b/htslib/cram/cram_codecs.c @@ -3613,7 +3613,10 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, cp = b->data + b->idx; cp_end = b->data + b->uncomp_size; - stop = c->u.byte_array_stop.stop; + // STOP byte is hard-coded as zero by our name tokeniser decoder + // implementation, so we may ignore what was requested. + stop = b->orig_method == TOK3 ? 
0 : c->u.byte_array_stop.stop; + if (cp_end - cp < out->alloc - out->byte) { unsigned char *out_cp = BLOCK_END(out); while (cp != cp_end && *cp != stop) diff --git a/htslib/cram/cram_decode.c b/htslib/cram/cram_decode.c index 2b2ad602..9fb32295 100644 --- a/htslib/cram/cram_decode.c +++ b/htslib/cram/cram_decode.c @@ -2390,8 +2390,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((fd->required_fields & SAM_SEQ) && s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) { - hts_log_error("Unable to fetch reference #%d:%"PRId64"-%"PRId64"\n", - ref_id, s->hdr->ref_seq_start, + hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64, + fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref + ? fd->refs->ref_id[ref_id]->name + : "unknown", + s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); return -1; } @@ -3247,6 +3250,10 @@ cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { if (fd->ooc) break; +// printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd, +// c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1, +// fd->range.refid, fd->range.start, fd->range.end); + /* Skip containers not yet spanning our range */ if (fd->range.refid != -2 && c_next->ref_seq_id != -2) { // ref_id beyond end of range; bail out diff --git a/htslib/cram/cram_encode.c b/htslib/cram/cram_encode.c index 5d22db54..6fe797a6 100644 --- a/htslib/cram/cram_encode.c +++ b/htslib/cram/cram_encode.c @@ -1761,7 +1761,6 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { c->ref_start = ref_start+1; c->ref_end = ref_end+1; c->ref_free = 1; - return 0; err: @@ -1843,7 +1842,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // Don't try embed ref if we repeatedly fail pthread_mutex_lock(&fd->ref_lock); int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries - if (!failed_embed && c->embed_ref == -2) { + if (!failed_embed && c->embed_ref == -2 
&& c->ref_id >= 0) { hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter); fd->no_ref = c->no_ref = 0; fd->embed_ref = c->embed_ref = 2; @@ -1922,6 +1921,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // Do not confuse with fd->ref_free which is a pointer to a // reference string to free. c->ref_free = 1; + } else { + // Double check for broken input. We shouldn't have + // embedded references enabled for unmapped data, but our + // data could be broken. + embed_ref = 0; + no_ref = c->no_ref = 1; } } c->ref_seq_id = c->ref_id; @@ -1968,7 +1973,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // Embed consensus / MD-generated ref if (embed_ref == 2) { - if (cram_generate_reference(c, s, r1) < 0) { + if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) { // Should this be a permanent thing via fd->no_ref? // Doing so means we cannot easily switch back again should // things fix themselves later on. This is likely not a @@ -1997,6 +2002,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { fd->no_ref_counter -= (fd->no_ref_counter > 0); pthread_mutex_unlock(&fd->ref_lock); } + + if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length) + c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length; } // Iterate through records creating the cram blocks for some @@ -3683,7 +3691,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, return -1; } fake_qual = spos; - cr->aend = no_ref ? apos : MIN(apos, c->ref_end); + // Protect against negative length refs (fuzz 382922241) + cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end)); if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0) goto block_err; diff --git a/htslib/cram/cram_index.c b/htslib/cram/cram_index.c index 77c953d6..b08ca5bf 100644 --- a/htslib/cram/cram_index.c +++ b/htslib/cram/cram_index.c @@ -580,9 +580,7 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { // Ideally use an index, so see if we have one. 
if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { if (0 != cram_seek(fd, e->offset, SEEK_SET)) { - if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) { - ret = -1; goto err; - } + ret = -1; goto err; } } else { // Absent from index, but this most likely means it simply has no data. diff --git a/htslib/cram/cram_io.c b/htslib/cram/cram_io.c index 7f7ffca4..997c9c39 100644 --- a/htslib/cram/cram_io.c +++ b/htslib/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2024 Genome Research Ltd. +Copyright (c) 2012-2025 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -76,6 +76,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cram.h" #include "os.h" #include "../htslib/hts.h" +#include "../hts_internal.h" #include "open_trace_file.h" #if defined(HAVE_EXTERNAL_LIBHTSCODECS) @@ -1698,8 +1699,7 @@ int cram_uncompress_block(cram_block *b) { free(uncomp); return -1; } - b->orig_method = RANS_PR0 + (b->data[0]&1) - + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + b->orig_method = RANSPR; free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize2; @@ -1718,8 +1718,7 @@ int cram_uncompress_block(cram_block *b) { free(uncomp); return -1; } - b->orig_method = ARITH_PR0 + (b->data[0]&1) - + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + b->orig_method = ARITH; free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize2; @@ -2475,6 +2474,9 @@ static refs_t *refs_create(void) { static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { BGZF *fp; + if (strncmp(fn, "file://", 7) == 0) + fn += 7; + if (!is_md5 && !hisremote(fn)) { char fai_file[PATH_MAX]; @@ -2787,9 +2789,15 @@ static int refs_from_header(cram_fd *fd) { /* Initialise likely filename if known */ if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { - if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) { + if ((tag = sam_hrecs_find_key(ty, 
"M5", NULL))) r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); - //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); + + if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) { + // LN tag used when constructing consensus reference + r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0); + // See fuzz 382922241 + if (r->ref_id[j]->LN_length < 0) + r->ref_id[j]->LN_length = 0; } } @@ -2930,30 +2938,6 @@ static void mkdir_prefix(char *path, int mode) { *cp = '/'; } -/* - * Return the cache directory to use, based on the first of these - * environment variables to be set to a non-empty value. - */ -static const char *get_cache_basedir(const char **extra) { - char *base; - - *extra = ""; - - base = getenv("XDG_CACHE_HOME"); - if (base && *base) return base; - - base = getenv("HOME"); - if (base && *base) { *extra = "/.cache"; return base; } - - base = getenv("TMPDIR"); - if (base && *base) return base; - - base = getenv("TEMP"); - if (base && *base) return base; - - return "/tmp"; -} - /* * Queries the M5 string from the header and attempts to populate the * reference from this using the REF_PATH environment. @@ -2967,31 +2951,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { sam_hrec_tag_t *tag; char path[PATH_MAX]; kstring_t path_tmp = KS_INITIALIZE; - char cache[PATH_MAX], cache_root[PATH_MAX]; char *local_cache = getenv("REF_CACHE"); mFILE *mf; int local_path = 0; hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); - cache_root[0] = '\0'; - - if (!ref_path || *ref_path == '\0') { - /* - * If we have no ref path, we use the EBI server. - * However to avoid spamming it we require a local ref cache too. 
- */ - ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s"; - if (!local_cache || *local_cache == '\0') { - const char *extra; - const char *base = get_cache_basedir(&extra); - snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra); - snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); - local_cache = cache; - hts_log_info("Populating local cache: %s", local_cache); - } - } - if (!r->name) return -1; @@ -3005,7 +2970,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Use cache if available */ if (local_cache && *local_cache) { - if (expand_cache_path(path, local_cache, tag->str+3) == 0) + struct stat sb; + if (expand_cache_path(path, local_cache, tag->str+3) == 0 && + stat(path, &sb) == 0) + // Found it in the local cache local_path = 1; } @@ -3049,7 +3017,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Otherwise search full REF_PATH; slower as loads entire file */ - if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { + int is_local = 0; + if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) { size_t sz; r->seq = mfsteal(mf, &sz); if (r->seq) { @@ -3065,15 +3034,23 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } else { refs_t *refs; const char *fn; + sam_hrec_tag_t *UR_tag; no_M5: /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ - if (!(tag = sam_hrecs_find_key(ty, "UR", NULL))) + if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL))) return -1; - fn = (strncmp(tag->str+3, "file:", 5) == 0) - ? tag->str+8 - : tag->str+3; + if (strstr(UR_tag->str+3, "://") && + strncmp(UR_tag->str+3, "file:", 5) != 0) { + // Documented as omitted, but accidentally supported until now + hts_log_error("UR tags pointing to remote files are not supported"); + return -1; + } + + fn = (strncmp(UR_tag->str+3, "file:", 5) == 0) + ? 
UR_tag->str+8 + : UR_tag->str+3; if (fd->refs->fp) { if (bgzf_close(fd->refs->fp) != 0) @@ -3104,15 +3081,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } /* Populate the local disk cache if required */ - if (local_cache && *local_cache) { + if (!is_local && local_cache && *local_cache) { hFILE *fp; - if (*cache_root && !is_directory(cache_root)) { - hts_log_warning("Creating reference cache directory %s\n" - "This may become large; see the samtools(1) manual page REF_CACHE discussion", - cache_root); - } - if (expand_cache_path(path, local_cache, tag->str+3) < 0) { return 0; // Not fatal - we have the data already so keep going. } @@ -3470,7 +3441,9 @@ char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) { hts_log_warning("Reference file given, but ref '%s' not present", r->name); if (cram_populate_ref(fd, id, r) == -1) { - hts_log_warning("Failed to populate reference for id %d", id); + hts_log_warning("Failed to populate reference \"%s\"", + r->name); + hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions"); pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); return NULL; @@ -4320,7 +4293,7 @@ int cram_flush_container_mt(cram_fd *fd, cram_container *c) { if (!pending) break; - usleep(1000); + hts_usleep(1000); } return 0; @@ -4952,6 +4925,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { hts_log_warning("NOTE: the CRAM file will be bigger " "than using an external reference"); pthread_mutex_lock(&fd->ref_lock); + // Best guess. It may be unmapped data with broken + // headers, in which case this will get ignored. fd->embed_ref = 2; pthread_mutex_unlock(&fd->ref_lock); break; @@ -5257,7 +5232,7 @@ static void cram_init_tables(cram_fd *fd) { // Default version numbers for CRAM static int major_version = 3; -static int minor_version = 0; +static int minor_version = 1; /* * Opens a CRAM file for read (mode "rb") or write ("wb"). 
@@ -5434,28 +5409,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { * -1 on failure */ int cram_seek(cram_fd *fd, off_t offset, int whence) { - char buf[65536]; - fd->ooc = 0; cram_drain_rqueue(fd); - if (hseek(fd->fp, offset, whence) >= 0) { - return 0; - } - - if (!(whence == SEEK_CUR && offset >= 0)) - return -1; - - /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */ - while (offset > 0) { - int len = MIN(65536, offset); - if (len != hread(fd->fp, buf, len)) - return -1; - offset -= len; - } - - return 0; + return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1; } /* @@ -5814,6 +5772,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { case CRAM_OPT_RANGE: { int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *)); pthread_mutex_lock(&fd->range_lock); +// printf("opt range noseek to %p %d:%ld-%ld\n", +// fd, fd->range.refid, fd->range.start, fd->range.end); if (fd->range.refid != -2) fd->required_fields |= SAM_POS; pthread_mutex_unlock(&fd->range_lock); diff --git a/htslib/cram/cram_structs.h b/htslib/cram/cram_structs.h index 9540b561..49c2e81d 100644 --- a/htslib/cram/cram_structs.h +++ b/htslib/cram/cram_structs.h @@ -671,7 +671,8 @@ struct cram_slice { typedef struct ref_entry { char *name; char *fn; - int64_t length; + int64_t length; // if 0 this indicates we haven't loaded it yet + int64_t LN_length; // @SQ LN length, used to trim consensus ref int64_t offset; int bases_per_line; int line_length; diff --git a/htslib/cram/open_trace_file.c b/htslib/cram/open_trace_file.c index 4d617b73..9dc2f964 100644 --- a/htslib/cram/open_trace_file.c +++ b/htslib/cram/open_trace_file.c @@ -176,7 +176,8 @@ char *tokenise_search_path(const char *searchpath) { return newsearch; } -static char *expand_path(const char *file, char *dirname, int max_s_digits); +static char *expand_path(const char *file, const char *dirname, + int max_s_digits); mFILE *find_file_url(const char *file, char *url) { char *path = 
NULL, buf[8192]; @@ -226,10 +227,12 @@ mFILE *find_file_url(const char *file, char *url) { * * Returns expanded pathname or NULL for malloc failure. */ -static char *expand_path(const char *file, char *dirname, int max_s_digits) { +static char *expand_path(const char *file, const char *dirname, + int max_s_digits) { size_t len = strlen(dirname); size_t lenf = strlen(file); - char *cp, *path; + const char *end_dirname = dirname + len, *cp; + char *path; path = malloc(len+lenf+2); // worst expansion DIR/FILE if (!path) { @@ -237,8 +240,11 @@ static char *expand_path(const char *file, char *dirname, int max_s_digits) { return NULL; } - if (dirname[len-1] == '/') + // Remove trailing '/'s, unless the path matches "/" + while (len > 1 && dirname[len-1] == '/') { len--; + end_dirname--; + } /* Special case for "./" or absolute filenames */ if (*file == '/' || (len==1 && *dirname == '.')) { @@ -246,41 +252,54 @@ static char *expand_path(const char *file, char *dirname, int max_s_digits) { } else { /* Handle %[0-9]*s expansions, if required */ char *path_end = path; - *path = 0; + while ((cp = strchr(dirname, '%'))) { char *endp; + // Get optional length long l = strtol(cp+1, &endp, 10); - if (*endp != 's' || endp - cp - 1 > max_s_digits) { - strncpy(path_end, dirname, (endp+1)-dirname); - path_end += (endp+1)-dirname; - dirname = endp+1; + if (*endp != 's' || l < 0 || endp - cp - 1 > max_s_digits) { + // Not %[0-9]s. Copy over directly, taking care of edge cases + // like the string ending with '%' or '%[0-9]*'. 
+ const char *e = MIN(endp+1, end_dirname); + memcpy(path_end, dirname, e - dirname); + path_end += e - dirname; + dirname = e; continue; } - strncpy(path_end, dirname, cp-dirname); + // Copy part up to '%' + memcpy(path_end, dirname, cp-dirname); path_end += cp-dirname; - if (l) { - strncpy(path_end, file, l); - path_end += MIN(strlen(file), l); - file += MIN(strlen(file), l); - } else { - strcpy(path_end, file); - path_end += strlen(file); - file += strlen(file); - } - len -= (endp+1) - dirname; + + // Insert segment from file + size_t to_copy = l > 0 ? MIN(lenf, l) : lenf; + memcpy(path_end, file, to_copy); + path_end += to_copy; + file += to_copy; + lenf -= to_copy; + + // Skip to part of dirname after the 's' dirname = endp+1; } - strncpy(path_end, dirname, len); - path_end += MIN(strlen(dirname), len); - *path_end = 0; + + // Add anything left in dirname + if (dirname < end_dirname) { + memcpy(path_end, dirname, end_dirname - dirname); + path_end += end_dirname - dirname; + } + if (*file) { - *path_end++ = '/'; - strcpy(path_end, file); + // Add remainder of file + if (path_end > path && *(path_end - 1) != '/') + *path_end++ = '/'; + memcpy(path_end, file, lenf); + path_end += lenf; } + // Terminate string + *path_end = '\0'; } - //fprintf(stderr, "*PATH=\"%s\"\n", path); + // fprintf(stderr, "*PATH=\"%s\"\n", path); return path; } @@ -324,14 +343,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) { * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. 
*/ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local) { char *newsearch; char *ele; mFILE *fp; + if (local) + *local = 1; + /* Use path first */ if (!path) path = getenv("RAWDATA"); @@ -361,14 +387,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { if (0 == strncmp(ele2, "URL=", 4)) { if ((fp = find_file_url(file, ele2+4))) { + if (local) + *local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0; free(newsearch); return fp; } - } else if (!strncmp(ele2, "http:", 5) || - !strncmp(ele2, "https:", 6) || - !strncmp(ele2, "ftp:", 4)) { + } else if (hisremote(ele2)) { if ((fp = find_file_url(file, ele2))) { free(newsearch); + if (local) + *local = 0; return fp; } } else if ((fp = find_file_dir(file, ele2))) { diff --git a/htslib/cram/open_trace_file.h b/htslib/cram/open_trace_file.h index 45860980..b7b6a4a7 100644 --- a/htslib/cram/open_trace_file.h +++ b/htslib/cram/open_trace_file.h @@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath); * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to); +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local); /* * Returns a mFILE containing the entire contents of the url; diff --git a/htslib/header.c b/htslib/header.c index 7f62074f..02283f60 100644 --- a/htslib/header.c +++ b/htslib/header.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2018-2020, 2023 Genome Research Ltd. +Copyright (c) 2018-2020, 2023, 2025 Genome Research Ltd. 
Authors: James Bonfield , Valeriu Ohan Redistribution and use in source and binary forms, with or without @@ -145,7 +145,7 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, const char *name = NULL; const char *altnames = NULL; hts_pos_t len = -1; - int r; + int r, invLN = 0; khint_t k; while (tag) { @@ -154,7 +154,11 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, name = tag->str+3; } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { assert(tag->len >= 3); + hts_pos_t tmp = len; len = strtoll(tag->str+3, NULL, 10); + if (tmp != -1 && tmp != len) { //duplicate and different LN + invLN = 1; + } } else if (tag->str[0] == 'A' && tag->str[1] == 'N') { assert(tag->len >= 3); altnames = tag->str+3; @@ -173,6 +177,12 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, return -1; // LN should be present, according to spec. } + if (invLN) { + hts_log_error("Header includes @SQ line \"%s\" with multiple LN:" + " tag with different values.", name); + return -1; // LN should not be duplicated or be same + } + // Seen already? 
k = kh_get(m_s2i, hrecs->ref_hash, name); if (k < kh_end(hrecs->ref_hash)) { @@ -1450,7 +1460,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) { sam_hrecs_t *hrecs; - if (!bh || !type || position <= 0) + if (!bh || !type || position < 0) return -1; if (!(hrecs = bh->hrecs)) { diff --git a/htslib/hfile.c b/htslib/hfile.c index 552b7177..3b60bedd 100644 --- a/htslib/hfile.c +++ b/htslib/hfile.c @@ -107,12 +107,20 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) hFILE *fp = (hFILE *) malloc(struct_size); if (fp == NULL) goto error; - if (capacity == 0) capacity = 32768; + const int maxcap = 128*1024; + + if (capacity == 0) capacity = maxcap; // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory - if (strchr(mode, 'r') && capacity > 32768) capacity = 32768; + if (strchr(mode, 'r') && capacity > maxcap) capacity = maxcap; +#ifdef HAVE_POSIX_MEMALIGN + fp->buffer = NULL; + if (posix_memalign((void **)&fp->buffer, 256, capacity) < 0) + goto error; +#else fp->buffer = (char *) malloc(capacity); if (fp->buffer == NULL) goto error; +#endif fp->begin = fp->end = fp->buffer; fp->limit = &fp->buffer[capacity]; @@ -629,7 +637,12 @@ static size_t blksize(int fd) #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE struct stat sbuf; if (fstat(fd, &sbuf) != 0) return 0; - return sbuf.st_blksize; + + // Pipes/FIFOs on linux return 4Kb here often, but it's much too small + // for performant I/O. + return S_ISFIFO(sbuf.st_mode) + ? 128*1024 + : sbuf.st_blksize; #else return 0; #endif diff --git a/htslib/hfile_libcurl.c b/htslib/hfile_libcurl.c index 6bbd88fe..3463acf4 100644 --- a/htslib/hfile_libcurl.c +++ b/htslib/hfile_libcurl.c @@ -34,7 +34,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #ifndef _WIN32 # include #endif -#include #include #include "hfile_internal.h" @@ -1247,19 +1246,6 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) if (env_curl_ca_bundle) { err |= curl_easy_setopt(fp->easy, CURLOPT_CAINFO, env_curl_ca_bundle); } -#if defined __linux__ && defined BUILDING_WHEEL - else { - // Linux wheels are (currently) built on AlmaLinux, so the libcurl.so bundled - // into the wheel follows Alma/Red Hat/Fedora conventions for the location of - // its certificate bundle. This fails when the wheel is used on a Debian/Ubuntu - // platform with a different convention for this location. When not overridden - // by $CURL_CA_BUNDLE, work around this by specifying the expected Debian bundle - // location if the Red Hat one isn't present. - struct stat st; - if (stat("/etc/pki", &st) < 0 && errno == ENOENT) - err |= curl_easy_setopt(fp->easy, CURLOPT_CAINFO, "/etc/ssl/certs/ca-certificates.crt"); - } -#endif } err |= curl_easy_setopt(fp->easy, CURLOPT_USERAGENT, curl.useragent.s); if (fp->headers.callback) { diff --git a/htslib/hts.c b/htslib/hts.c index a8a8bead..14081e6f 100644 --- a/htslib/hts.c +++ b/htslib/hts.c @@ -232,6 +232,9 @@ const char *hts_feature_string(void) { } +// Converts ASCII to BAM nibble encoding. +// Note 0123 is treated as ACGT (ABI colourspace encoding) and +// U is treated as T. 
HTSLIB_EXPORT const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, @@ -239,9 +242,9 @@ const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15, 5, 6, 8, 8, 7, 9, 15,10,15,15, 15,15,15,15, 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15, 5, 6, 8, 8, 7, 9, 15,10,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, @@ -4288,7 +4291,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) if (iter->curr_off) { // seek to the start if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); - return -1; + return -2; } iter->curr_off = 0; // only seek once } @@ -4366,7 +4369,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) next_range = 0; if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) { hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off); - return -1; + return -2; } if (iter->is_cram) { cram_range r = { HTS_IDX_NOCOOR }; @@ -4418,7 +4421,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); - return -1; + return -2; } // Find the genomic range matching this interval. 
@@ -4476,7 +4479,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); - return -1; + return -2; } } } diff --git a/htslib/hts_internal.h b/htslib/hts_internal.h index 52f29e6c..d1b25509 100644 --- a/htslib/hts_internal.h +++ b/htslib/hts_internal.h @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include "htslib/hts.h" #include "textutils_internal.h" @@ -129,7 +130,7 @@ static inline int find_file_extension(const char *fn, char ext_out[static HTS_MA if (!fn) return -1; if (!delim) delim = fn + strlen(fn); for (ext = delim; ext > fn && *ext != '.' && *ext != '/'; --ext) {} - if (*ext == '.' && + if (*ext == '.' && ext > fn && ((delim - ext == 3 && ext[1] == 'g' && ext[2] == 'z') || // permit .sam.gz as a valid file extension (delim - ext == 4 && ext[1] == 'b' && ext[2] == 'g' && ext[3] == 'z'))) // permit .vcf.bgz as a valid file extension { @@ -142,6 +143,12 @@ static inline int find_file_extension(const char *fn, char ext_out[static HTS_MA return 0; } +static inline int hts_usleep(long long usec) +{ + struct timespec req = { usec / 1000000, (usec % 1000000) * 1000 }; + return nanosleep(&req, NULL); +} + #ifdef __cplusplus } #endif diff --git a/htslib/htscodecs/htscodecs/arith_dynamic.c b/htslib/htscodecs/htscodecs/arith_dynamic.c index 37aca77b..9f891ee3 100644 --- a/htslib/htscodecs/htscodecs/arith_dynamic.c +++ b/htslib/htscodecs/htscodecs/arith_dynamic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Genome Research Ltd. + * Copyright (c) 2019-2022, 2025 Genome Research Ltd. 
* Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without @@ -733,15 +733,17 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, unsigned int c_meta_len; uint8_t *rle = NULL, *packed = NULL; - if (in_size > INT_MAX) { + if (in_size > INT_MAX || (out && *out_size == 0)) { *out_size = 0; return NULL; } if (!out) { *out_size = arith_compress_bound(in_size, order); - if (!(out = malloc(*out_size))) + if (!(out = malloc(*out_size))) { + *out_size = 0; return NULL; + } } unsigned char *out_end = out + *out_size; @@ -751,24 +753,30 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, if (order & X_CAT) { out[0] = X_CAT; c_meta_len = 1 + var_put_u32(&out[1], out_end, in_size); + if (c_meta_len + in_size > *out_size) { + *out_size = 0; + return NULL; + } memcpy(out+c_meta_len, in, in_size); *out_size = in_size+c_meta_len; } if (order & X_STRIPE) { - int N = (order>>8); + int N = (order>>8) & 0xff; if (N == 0) N = 4; // default for compatibility with old tests - if (N > 255) - return NULL; + if (N > in_size) + N = in_size; unsigned char *transposed = malloc(in_size); unsigned int part_len[256]; unsigned int idx[256]; - if (!transposed) + if (!transposed) { + *out_size = 0; return NULL; - int i, j, x; + } + int i, j, x; for (i = 0; i < N; i++) { part_len[i] = in_size / N + ((in_size % N) > i); idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index @@ -788,6 +796,12 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, c_meta_len = 1; *out = order & ~X_NOSZ; c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size); + if (c_meta_len >= *out_size) { + free(transposed); + *out_size = 0; + return NULL; + } + out[c_meta_len++] = N; out2_start = out2 = out+7+5*N; // shares a buffer with c_meta @@ -795,6 +809,7 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, // Brute force try all methods. // FIXME: optimise this bit. 
Maybe learn over time? int j, best_j = 0, best_sz = INT_MAX; + uint8_t *r; // Works OK with read names. The first byte is the most important, // as it has most variability (little-endian). After that it's @@ -843,24 +858,37 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, // {1, 128}}; for (j = 1; j <= m[MIN(i,3)][0]; j++) { + if (out2 - out > *out_size) + continue; // an error, but caught in best_sz check later + olen2 = *out_size - (out2 - out); //fprintf(stderr, "order=%d m=%d\n", order&3, m[MIN(i,4)][j]); if ((order&3) == 0 && (m[MIN(i,3)][j]&1)) continue; - arith_compress_to(transposed+idx[i], part_len[i], - out2, &olen2, m[MIN(i,3)][j] | X_NOSZ); - if (best_sz > olen2) { + r = arith_compress_to(transposed+idx[i], part_len[i], + out2, &olen2, m[MIN(i,3)][j] | X_NOSZ); + if (r && olen2 && best_sz > olen2) { best_sz = olen2; best_j = j; } } -// if (best_j == 0) // none desireable -// return NULL; + + if (best_sz == INT_MAX) { + free(transposed); + *out_size = 0; + return NULL; + } if (best_j != j-1) { olen2 = *out_size - (out2 - out); - arith_compress_to(transposed+idx[i], part_len[i], - out2, &olen2, m[MIN(i,3)][best_j] | X_NOSZ); + r = arith_compress_to(transposed+idx[i], part_len[i], + out2, &olen2, + m[MIN(i,3)][best_j] | X_NOSZ); + if (!r) { + free(transposed); + *out_size = 0; + return NULL; + } } out2 += olen2; c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2); @@ -893,6 +921,10 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, // PACK 2, 4 or 8 symbols into one byte. 
int pmeta_len; uint64_t packed_len; + if (c_meta_len + 256 > *out_size) { + *out_size = 0; + return NULL; + } packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len); if (!packed) { out[0] &= ~X_PACK; @@ -934,6 +966,7 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, #else fprintf(stderr, "Htscodecs has been compiled without libbz2 support\n"); free(out); + *out_size = 0; return NULL; #endif @@ -945,25 +978,40 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size, // *out_size = lzma_size; } else { + uint8_t *r; if (do_rle) { if (order == 0) - arith_compress_O0_RLE(in, in_size, out+c_meta_len, out_size); + r=arith_compress_O0_RLE(in, in_size, out+c_meta_len, out_size); else - arith_compress_O1_RLE(in, in_size, out+c_meta_len, out_size); + r=arith_compress_O1_RLE(in, in_size, out+c_meta_len, out_size); } else { //if (order == 2) // arith_compress_O2(in, in_size, out+c_meta_len, out_size); //else if (order == 1) - arith_compress_O1(in, in_size, out+c_meta_len, out_size); + r=arith_compress_O1(in, in_size, out+c_meta_len, out_size); else - arith_compress_O0(in, in_size, out+c_meta_len, out_size); + r=arith_compress_O0(in, in_size, out+c_meta_len, out_size); + } + + if (!r) { + free(rle); + free(packed); + *out_size = 0; + return NULL; } } if (*out_size >= in_size) { out[0] &= ~(3|X_EXT); // no entropy encoding, but keep e.g. PACK out[0] |= X_CAT | no_size; + + if (out + c_meta_len + in_size > out_end) { + free(rle); + free(packed); + *out_size = 0; + return NULL; + } memcpy(out+c_meta_len, in, in_size); *out_size = in_size; } diff --git a/htslib/htscodecs/htscodecs/htscodecs.h b/htslib/htscodecs/htscodecs/htscodecs.h index 8d67e67a..4b694d9a 100644 --- a/htslib/htscodecs/htscodecs/htscodecs.h +++ b/htslib/htscodecs/htscodecs/htscodecs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024 Genome Research Ltd. + * Copyright (c) 2021-2025 Genome Research Ltd. 
* Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without @@ -43,7 +43,7 @@ * Note currently this needs manually editing as it isn't automatically * updated by autoconf. */ -#define HTSCODECS_VERSION 100601 +#define HTSCODECS_VERSION 100603 /* * A const string form of the HTSCODECS_VERSION define. diff --git a/htslib/htscodecs/htscodecs/rANS_static4x16pr.c b/htslib/htscodecs/htscodecs/rANS_static4x16pr.c index 8c9a64ad..a16c17f0 100644 --- a/htslib/htscodecs/htscodecs/rANS_static4x16pr.c +++ b/htslib/htscodecs/htscodecs/rANS_static4x16pr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Genome Research Ltd. + * Copyright (c) 2017-2023, 2025 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without @@ -1164,7 +1164,7 @@ void rans_set_cpu(int opts) { unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, unsigned char *out,unsigned int *out_size, int order) { - if (in_size > INT_MAX) { + if (in_size > INT_MAX || (out && *out_size == 0)) { *out_size = 0; return NULL; } @@ -1177,8 +1177,10 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, *out_size = rans_compress_bound_4x16(in_size, order); if (*out_size == 0) return NULL; - if (!(out_free = out = malloc(*out_size))) + if (!(out_free = out = malloc(*out_size))) { + *out_size = 0; return NULL; + } } unsigned char *out_end = out + *out_size; @@ -1199,11 +1201,15 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, int N = (order>>8) & 0xff; if (N == 0) N = 4; // default for compatibility with old tests + if (N > in_size) + N = in_size; + unsigned char *transposed = malloc(in_size); unsigned int part_len[256]; unsigned int idx[256]; if (!transposed) { free(out_free); + *out_size = 0; return NULL; } int i, j, x; @@ -1241,6 +1247,13 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, c_meta_len = 1; *out = order & 
~RANS_ORDER_NOSZ; c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size); + if (c_meta_len >= *out_size) { + free(out_free); + free(transposed); + *out_size = 0; + return NULL; + } + out[c_meta_len++] = N; unsigned char *out_best = NULL; @@ -1249,7 +1262,8 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, out2_start = out2 = out+7+5*N; // shares a buffer with c_meta for (i = 0; i < N; i++) { // Brute force try all methods. - int j, m[] = {1,64,128,0}, best_j = 0, best_sz = in_size+10; + uint8_t *r; + int j, m[] = {1,64,128,0}, best_j = 0, best_sz = INT_MAX; for (j = 0; j < sizeof(m)/sizeof(*m); j++) { if ((order & m[j]) != m[j]) continue; @@ -1257,18 +1271,24 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, // order-1 *only*; bit check above cannot elide order-0 if ((order & RANS_ORDER_STRIPE_NO0) && (m[j]&1) == 0) continue; + + if (out2 - out > *out_size) + continue; // an error, but caught in best_sz check later + olen2 = *out_size - (out2 - out); - rans_compress_to_4x16(transposed+idx[i], part_len[i], - out2, &olen2, - m[j] | RANS_ORDER_NOSZ - | (order&RANS_ORDER_X32)); - if (best_sz > olen2) { + r = rans_compress_to_4x16(transposed+idx[i], part_len[i], + out2, &olen2, + m[j] | RANS_ORDER_NOSZ + | (order&RANS_ORDER_X32)); + if (r && olen2 && best_sz > olen2) { best_sz = olen2; best_j = j; if (j < sizeof(m)/sizeof(*m) && olen2 > out_best_len) { unsigned char *tmp = realloc(out_best, olen2); if (!tmp) { free(out_free); + free(transposed); + *out_size = 0; return NULL; } out_best = tmp; @@ -1279,6 +1299,15 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, memcpy(out_best, out2, olen2); } } + + if (best_sz == INT_MAX) { + free(out_best); + free(out_free); + free(transposed); + *out_size = 0; + return NULL; + } + if (best_j < sizeof(m)/sizeof(*m)) { // Copy the best compression to output buffer if not current memcpy(out2, out_best, best_sz); @@ -1301,6 +1330,12 @@ 
unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, out[0] = RANS_ORDER_CAT; c_meta_len = 1; c_meta_len += var_put_u32(&out[1], out_end, in_size); + + if (c_meta_len + in_size > *out_size) { + free(out_free); + *out_size = 0; + return NULL; + } if (in_size) memcpy(out+c_meta_len, in, in_size); *out_size = c_meta_len + in_size; @@ -1329,6 +1364,11 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, // PACK 2, 4 or 8 symbols into one byte. int pmeta_len; uint64_t packed_len; + if (c_meta_len + 256 > *out_size) { + free(out_free); + *out_size = 0; + return NULL; + } packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len); if (!packed) { out[0] &= ~RANS_ORDER_PACK; @@ -1357,6 +1397,7 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, c_rmeta_len = in_size+257; if (!(meta = malloc(c_rmeta_len))) { free(out_free); + *out_size = 0; return NULL; } @@ -1380,8 +1421,23 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, // Compress lengths with O0 and literals with O0/O1 ("order" param) int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2; sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len); + if ((c_meta_len+sz+5) > *out_size) { + free(out_free); + free(rle); + free(meta); + free(packed); + *out_size = 0; + return NULL; + } c_rmeta_len = *out_size - (c_meta_len+sz+5); - rans_enc_func(do_simd, 0)(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len); + if (!rans_enc_func(do_simd, 0)(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len)) { + free(out_free); + free(rle); + free(meta); + free(packed); + *out_size = 0; + return NULL; + } if (c_rmeta_len < rmeta_len) { sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len); memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len); @@ -1404,17 +1460,39 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, out[0] &= ~RANS_ORDER_RLE; } + if (c_meta_len > 
*out_size) { + free(out_free); + free(rle); + free(packed); + *out_size = 0; + return NULL; + } + *out_size -= c_meta_len; if (order && in_size < 8) { out[0] &= ~1; order &= ~1; } - rans_enc_func(do_simd, order)(in, in_size, out+c_meta_len, out_size); + if (!rans_enc_func(do_simd, order)(in, in_size, out+c_meta_len, out_size)) { + free(out_free); + free(rle); + free(packed); + *out_size = 0; + return NULL; + } if (*out_size >= in_size) { out[0] &= ~3; out[0] |= RANS_ORDER_CAT | no_size; + + if (out + c_meta_len + in_size > out_end) { + free(out_free); + free(rle); + free(packed); + *out_size = 0; + return NULL; + } if (in_size) memcpy(out+c_meta_len, in, in_size); *out_size = in_size; diff --git a/htslib/htscodecs/htscodecs/tokenise_name3.c b/htslib/htscodecs/htscodecs/tokenise_name3.c index 74935790..08e62615 100644 --- a/htslib/htscodecs/htscodecs/tokenise_name3.c +++ b/htslib/htscodecs/htscodecs/tokenise_name3.c @@ -1555,6 +1555,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith, if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, i&0xf, level, use_arith, out, &out_len) < 0) { free_context(ctx); + free(out); return NULL; } diff --git a/htslib/htscodecs/htscodecs/version.h b/htslib/htscodecs/htscodecs/version.h index 048dcab5..4ab8196c 100644 --- a/htslib/htscodecs/htscodecs/version.h +++ b/htslib/htscodecs/htscodecs/version.h @@ -1 +1 @@ -#define HTSCODECS_VERSION_TEXT "1.6.1" +#define HTSCODECS_VERSION_TEXT "1.6.3" diff --git a/htslib/htslib.pc.in b/htslib/htslib.pc.in index d969d6b4..fdeeba92 100644 --- a/htslib/htslib.pc.in +++ b/htslib/htslib.pc.in @@ -11,5 +11,5 @@ Description: C library for high-throughput sequencing data formats Version: @-PACKAGE_VERSION@ Cflags: -I${includedir} Libs: -L${libdir} -lhts -Libs.private: -L${libdir} @private_LIBS@ -lhts -lm -lpthread +Libs.private: -L${libdir} @private_LIBS@ -lm -lpthread Requires.private: zlib @pc_requires@ diff --git a/htslib/htslib/bgzf.h b/htslib/htslib/bgzf.h index 
87d4c6a3..36a4ff77 100644 --- a/htslib/htslib/bgzf.h +++ b/htslib/htslib/bgzf.h @@ -331,7 +331,7 @@ ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) { /** * Read one byte from a BGZF file. It is faster than bgzf_read() * @param fp BGZF file handler - * @return byte read; -1 on end-of-file or error + * @return byte read; -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT int bgzf_getc(BGZF *fp); diff --git a/htslib/htslib/hts.h b/htslib/htslib/hts.h index 4f85424c..3e78fa5c 100644 --- a/htslib/htslib/hts.h +++ b/htslib/htslib/hts.h @@ -455,6 +455,7 @@ int hts_parse_opt_list(htsFormat *opt, const char *str); The input character may be either an IUPAC ambiguity code, '=' for 0, or '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 for A/C/G/T or combinations of these bits for ambiguous bases. +Additionally RNA U is treated as a T (8). */ HTSLIB_EXPORT extern const unsigned char seq_nt16_table[256]; @@ -489,7 +490,7 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. -#define HTS_VERSION 102100 +#define HTS_VERSION 102200 /*! @abstract Introspection on the features enabled in htslib * @@ -1517,6 +1518,14 @@ static inline int hts_bin_level(int bin) { return l; } +/************************************** + * Exposing the CRC32 implementation * + * Either from zlib or libdeflate. * + *************************************/ +HTSLIB_EXPORT +uint32_t hts_crc32(uint32_t crc, const void *buf, size_t len); + + //! Compute the corresponding entry into the linear index of a given bin from //! a binning index /*! diff --git a/htslib/htslib/hts_defs.h b/htslib/htslib/hts_defs.h index b5cded34..71eb88d4 100644 --- a/htslib/htslib/hts_defs.h +++ b/htslib/htslib/hts_defs.h @@ -58,6 +58,12 @@ DEALINGS IN THE SOFTWARE. 
*/ #define HTS_NORETURN #endif +#if HTS_GCC_AT_LEAST(10,1) +#define HTS_ACCESS(access_mode, ...) __attribute__ ((access(access_mode, __VA_ARGS__))) +#else +#define HTS_ACCESS(access_mode, ...) +#endif + // Enable optimisation level 3, especially for gcc. To be used // where we want to force vectorisation in hot loops and the default -O2 // just doesn't cut it. diff --git a/htslib/htslib/kstring.h b/htslib/htslib/kstring.h index ebb2f936..8c4245c2 100644 --- a/htslib/htslib/kstring.h +++ b/htslib/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024-2025 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -62,6 +62,11 @@ #define ssize_t intptr_t #endif +#ifndef EOVERFLOW +#define HTSLIB_EOVERFLOW +#define EOVERFLOW ERANGE +#endif + /* kstring_t is a simple non-opaque type whose fields are likely to be * used directly by user code (but see also ks_str() and ks_len() below). 
* A kstring_t object is initialised by either of @@ -171,8 +176,7 @@ static inline int ks_expand(kstring_t *s, size_t expansion) { size_t new_size = s->l + expansion; - if (new_size < s->l) // Overflow check - return -1; + if (new_size < s->l) { errno = EOVERFLOW; return -1; } return ks_resize(s, new_size); } @@ -234,8 +238,8 @@ static inline void ks_free(kstring_t *s) static inline int kputsn(const char *p, size_t l, kstring_t *s) { size_t new_sz = s->l + l + 2; - if (new_sz <= s->l || ks_resize(s, new_sz) < 0) - return EOF; + if (new_sz <= s->l) { errno = EOVERFLOW; return EOF; } + if (ks_resize(s, new_sz) < 0) return EOF; memcpy(s->s + s->l, p, l); s->l += l; s->s[s->l] = 0; @@ -268,8 +272,8 @@ static inline int kputc_(int c, kstring_t *s) static inline int kputsn_(const void *p, size_t l, kstring_t *s) { size_t new_sz = s->l + l; - if (new_sz < s->l || ks_resize(s, new_sz ? new_sz : 1) < 0) - return EOF; + if (new_sz < s->l) { errno = EOVERFLOW; return EOF; } + if (ks_resize(s, new_sz ? new_sz : 1) < 0) return EOF; memcpy(s->s + s->l, p, l); s->l += l; return l; @@ -449,9 +453,65 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n) return offsets; } +/** + * kinsert_char - inserts a char to kstring + * @param c - char to insert + * @param pos - position at which to insert, starting from 0 + * @param s - pointer to output string + * Returns 0 on success and -1 on failure + * 0 for pos inserts at start and length of current string as pos appends at + * the end. 
+ */ +static inline int kinsert_char(char c, size_t pos, kstring_t *s) +{ + if (!s || pos > s->l) { + return EOF; + } + if (ks_resize(s, s->l + 2) < 0) { + return EOF; + } + memmove(s->s + pos + 1, s->s + pos, s->l - pos); + s->s[pos] = c; + s->s[++s->l] = 0; + return 0; +} + +/** + * kinsert_str - inserts a null terminated string to kstring + * @param str - string to insert + * @param pos - position at which to insert, starting from 0 + * @param s - pointer to output string + * Returns 0 on success and -1 on failure + * 0 for pos inserts at start and length of current string as pos appends at + * the end. empty string makes no update. + */ +static inline int kinsert_str(const char *str, size_t pos, kstring_t *s) +{ + size_t len = 0; + if (!s || pos > s->l || !str) { + return EOF; + } + if (!(len = strlen(str))) { + return 0; + } + if (ks_resize(s, s->l + len + 1) < 0) { + return EOF; + } + memmove(s->s + pos + len, s->s + pos, s->l - pos); + memcpy(s->s + pos, str, len); + s->l += len; + s->s[s->l] = '\0'; + return 0; +} + #ifdef HTSLIB_SSIZE_T #undef HTSLIB_SSIZE_T #undef ssize_t #endif +#ifdef HTSLIB_EOVERFLOW +#undef HTSLIB_EOVERFLOW +#undef EOVERFLOW +#endif + #endif diff --git a/htslib/htslib/sam.h b/htslib/htslib/sam.h index 0da5f047..51cdbc4d 100644 --- a/htslib/htslib/sam.h +++ b/htslib/htslib/sam.h @@ -245,7 +245,7 @@ typedef struct bam1_core_t { while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). 3. Cigar data is encoded 4 bytes per CIGAR operation. See the bam_cigar_* macros for manipulation. - 4. seq is nibble-encoded according to bam_nt16_table. + 4. seq is nibble-encoded according to seq_nt16_table. See the bam_seqi macro for retrieving individual bases. 5. Per base qualities are stored in the Phred scale with no +33 offset. Ie as per the BAM specification and not the SAM ASCII printable method. @@ -1031,7 +1031,7 @@ bam1_t *bam_dup1(const bam1_t *bsrc); @param isize Observed template length ("insert size") (a.k.a. TLEN). 
@param l_seq Length of the query sequence (read) and sequence quality string. @param seq Sequence, may be NULL if l_seq = 0. - @param qual Sequence quality, may be NULL. + @param qual Sequence quality, may be NULL. Should be provided without ASCII 33 offset. @param l_aux Length to be reserved for auxiliary field data, may be 0. @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. diff --git a/htslib/htslib/synced_bcf_reader.h b/htslib/htslib/synced_bcf_reader.h index 9a6b4843..a81ac371 100644 --- a/htslib/htslib/synced_bcf_reader.h +++ b/htslib/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2025 Genome Research Ltd. Author: Petr Danecek @@ -89,6 +89,7 @@ extern "C" { #define BCF_SR_PAIR_SNP_REF (1<<4) // allow REF-only records with SNPs #define BCF_SR_PAIR_INDEL_REF (1<<5) // allow REF-only records with indels #define BCF_SR_PAIR_EXACT (1<<6) // require the exact same set of alleles in all files +#define BCF_SR_PAIR_ID (1<<7) // require matching IDs (overlap) #define BCF_SR_PAIR_BOTH (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS) #define BCF_SR_PAIR_BOTH_REF (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS|BCF_SR_PAIR_SNP_REF|BCF_SR_PAIR_INDEL_REF) @@ -232,10 +233,30 @@ void bcf_sr_destroy_threads(bcf_srs_t *files); * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. + * Invokes bcf_sr_add_hreader with opened file */ HTSLIB_EXPORT int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); +/** + * bcf_sr_add_hreader() - open new reader using htsfile + * @readers: holder of the open readers + * @file_ptr: htsfile already opened + * @autoclose: close file along with reader or not, 1 - close, 0 - do not close + * @idxname: index file name for file in @file_ptr + * + * Returns 1 if the call succeeded, or 0 on error. 
+ * + * See also the bcf_srs_t data structure for parameters controlling + * the reader's logic. + * If idxname is NULL, uses file_ptr->fn to find index file. + * With idxname as NULL, index file must be present along with the file with + * default name + */ +HTSLIB_EXPORT +int bcf_sr_add_hreader(bcf_srs_t *readers, htsFile *file_ptr, int autoclose, + const char *idxname); + HTSLIB_EXPORT void bcf_sr_remove_reader(bcf_srs_t *files, int i); diff --git a/htslib/htslib/thread_pool.h b/htslib/htslib/thread_pool.h index b13ccb73..fe012030 100644 --- a/htslib/htslib/thread_pool.h +++ b/htslib/htslib/thread_pool.h @@ -115,6 +115,14 @@ HTSLIB_EXPORT int hts_tpool_size(hts_tpool *p); +/// Return the worker ID index, from 0 to nthreads-1. +/** + * @param p Thread pool + * @return The worker index (0..ntheads-1) or -1 if not found + */ +HTSLIB_EXPORT +int hts_tpool_worker_id(hts_tpool *pool); + /// Add an item to the work pool. /** * @param p Thread pool diff --git a/htslib/htslib/vcf.h b/htslib/htslib/vcf.h index 9a36cab0..105d70d7 100644 --- a/htslib/htslib/vcf.h +++ b/htslib/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2025 Genome Research Ltd. Author: Heng Li @@ -950,14 +950,13 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). * The @p string in bcf_update_info_flag() is optional, * @p n indicates whether the flag is set or removed. * - * Note that updating an END info tag will cause line->rlen to be - * updated as a side-effect (removing the tag will set it to the - * string length of the REF allele). If line->pos is being changed as + * Note that updating / removing END,SVLEN info tags will cause line->rlen + * to be recalculated as a side-effect. 
If line->pos is being changed as * well, it is important that this is done before calling - * bcf_update_info_int32() to update the END tag, otherwise rlen will be + * bcf_update_info_int32() to update the END/SVLEN tag, otherwise rlen will be * set incorrectly. If the new END value is less than or equal to * line->pos, a warning will be printed and line->rlen will be set to - * the length of the REF allele. + * the length of the REF allele for versions upto 4.3. */ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) @@ -1002,6 +1001,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). * of fixed-length strings. In case of strings with variable length, shorter strings * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() * are not \0-terminated. + * With vcf4.5, rlen depends on format field LEN and rlen calculation uses v->pos as well. + * So any change to pos be done before updating LEN that rlen calculated is correct. * * Returns 0 on success or negative value on error. */ @@ -1023,7 +1024,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_gt_unphased(idx) (((idx)+1)<<1) #define bcf_gt_missing 0 #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1) - #define bcf_gt_is_phased(idx) ((idx)&1) + #define bcf_gt_is_phased(val) ((val)&1) #define bcf_gt_allele(val) (((val)>>1)-1) /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */ @@ -1501,31 +1502,25 @@ static inline int bcf_float_is_vector_end(float f) return u.i==bcf_float_vector_end ? 
1 : 0; } + +/** + * bcf_format_gt_v2 - formats GT information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is preferred over bcf_format_gt as this supports vcf4.4 and + * prefixed phasing. Explicit / prefixed phasing for 1st allele is used only + * when it is a must to correctly express phasing. + */ +HTSLIB_EXPORT +int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, + kstring_t *str) HTS_RESULT_USED; + static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { - uint32_t e = 0; - #define BRANCH(type_t, convert, missing, vector_end) { \ - uint8_t *ptr = fmt->p + isample*fmt->size; \ - int i; \ - for (i=0; in; i++, ptr += sizeof(type_t)) \ - { \ - type_t val = convert(ptr); \ - if ( val == vector_end ) break; \ - if ( i ) e |= kputc("/|"[val&1], str) < 0; \ - if ( !(val>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((val>>1) - 1, str) < 0; \ - } \ - if (i == 0) e |= kputc('.', str) < 0; \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; - case BCF_BT_NULL: e |= kputc('.', str) < 0; break; - default: hts_log_error("Unexpected type %d", fmt->type); return -2; - } - #undef BRANCH - return e == 0 ? 
0 : -1; + return bcf_format_gt_v2(NULL, fmt, isample, str); } static inline int bcf_enc_size(kstring_t *s, int size, int type) diff --git a/htslib/m4/hts_hide_dynamic_syms.m4 b/htslib/m4/hts_hide_dynamic_syms.m4 index 62ccb8eb..62e67e1c 100644 --- a/htslib/m4/hts_hide_dynamic_syms.m4 +++ b/htslib/m4/hts_hide_dynamic_syms.m4 @@ -50,6 +50,59 @@ AC_DEFUN([HTS_TEST_CC_C_LD_FLAG], AS_VAR_POPDEF([hts_cv_check_flag])dnl ]) +# SYNOPSIS +# +# HTS_TEST_CC_FLAG(FLAG, FOUND_VAR, REQUIRE_SILENCE) +# +# Test if FLAG can be used on CFLAGS. It it works, +# variable FOUND_VAR is set to FLAG. If REQUIRE_SILENCE is "yes", +# only pass if the compilation did not produce any diagnostics (needed +# to deal with compilers that accept unknown options, generate +# warnings about them but don't exit non-zero, thus breaking the test). + +AC_DEFUN([HTS_TEST_CC_FLAG], + [AS_VAR_PUSHDEF([hts_cv_check_flag],[hts_cv_check_$1])dnl + AC_CACHE_CHECK([whether the compiler accepts $1], + [hts_cv_check_flag], + [ac_check_save_cflags=$CFLAGS + CFLAGS="$CFLAGS $1" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], + [AS_IF([test "x$3" = "xyes" && test -s conftest.err], + [AS_VAR_SET([hts_cv_check_flag],[no])], + [AS_VAR_SET([hts_cv_check_flag],[yes]) + AS_IF([test "x$2" != x],[eval AS_TR_SH([$2])="$1"])])], + [AS_VAR_SET([hts_cv_check_flag],[no])]) + CFLAGS=$ac_check_save_cflags]) + AS_VAR_POPDEF([hts_cv_check_flag])dnl +]) + +# SYNOPSIS +# +# HTS_TEST_CC_LD_FLAG(FLAG, FOUND_VAR, REQUIRE_SILENCE) +# +# Test if FLAG can be used on LDFLAGS. It it works, +# variable FOUND_VAR is set to FLAG. If REQUIRE_SILENCE is "yes", +# only pass if the compilation did not produce any diagnostics (needed +# to deal with compilers that accept unknown options, generate +# warnings about them but don't exit non-zero, thus breaking the test). 
+ +AC_DEFUN([HTS_TEST_CC_LD_FLAG], + [AS_VAR_PUSHDEF([hts_cv_check_flag],[hts_cv_check_$1])dnl + AC_CACHE_CHECK([whether the compiler accepts $1], + [hts_cv_check_flag], + [ac_check_save_ldflags=$LDFLAGS + LDFLAGS="$LDFLAGS $1" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [AS_IF([test "x$3" = "xyes" && test -s conftest.err], + [AS_VAR_SET([hts_cv_check_flag],[no])], + [AS_VAR_SET([hts_cv_check_flag],[yes]) + AS_IF([test "x$2" != x],[eval AS_TR_SH([$2])="$1"])])], + [AS_VAR_SET([hts_cv_check_flag],[no])]) + LDFLAGS=$ac_check_save_ldflags]) + AS_VAR_POPDEF([hts_cv_check_flag])dnl +]) + + AC_DEFUN([HTS_HIDE_DYNAMIC_SYMBOLS], [ # Test for flags to set default shared library visibility to hidden # -fvisibility=hidden : GCC compatible diff --git a/htslib/sam.c b/htslib/sam.c index 7e58da6e..d2f18371 100644 --- a/htslib/sam.c +++ b/htslib/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2024 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. 
Author: Heng Li @@ -1964,8 +1964,16 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { strncpy(sn, q, r - q); q = r; } else { - if (strncmp(q, "LN:", 3) == 0) - ln = strtoll(q + 3, (char**)&q, 10); + if (strncmp(q, "LN:", 3) == 0) { + hts_pos_t tmp = strtoll(q + 3, (char**)&q, 10); + if (ln != -1 && ln != tmp) { //duplicate & different LN + hts_log_error("Header includes @SQ line \"%s\" with" + " multiple LN: tag with different values.", sn); + goto error; + } else { + ln = tmp; + } + } } while (*q != '\t' && *q != '\n' && *q != '\0') @@ -3216,6 +3224,7 @@ enum sam_cmd { SAM_NONE = 0, SAM_CLOSE, SAM_CLOSE_DONE, + SAM_AT_EOF, }; typedef struct SAM_state { @@ -3319,7 +3328,7 @@ int sam_state_destroy(htsFile *fp) { break; hts_tpool_wake_dispatch(fd->q); pthread_mutex_unlock(&fd->command_m); - usleep(10000); + hts_usleep(10000); pthread_mutex_lock(&fd->command_m); } } @@ -3339,7 +3348,7 @@ int sam_state_destroy(htsFile *fp) { pthread_mutex_unlock(&fd->command_m); while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) { - usleep(10000); + hts_usleep(10000); pthread_mutex_lock(&fd->command_m); ret = -fd->errcode; // not empty but shutdown implies error @@ -3651,6 +3660,7 @@ static void *sam_dispatcher_read(void *vp) { pthread_mutex_unlock(&fd->command_m); } + // Submit a NULL sp_bams entry to act as an EOF marker if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) goto err; @@ -4180,6 +4190,7 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { -1, -1, 0, // mate x->seq.l, x->seq.s, x->qual.s, 0); + if (ret < 0) return -2; // Identify Illumina CASAVA strings. 
// ::: @@ -4276,7 +4287,7 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { return -2; } if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) - return -1; + return -2; fp->fp.bgzf->seeked = 0; goto err_recover; } @@ -4298,7 +4309,7 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { if (fd->h != h) { hts_log_error("SAM multi-threaded decoding does not support changing header"); - return -1; + return -2; } sp_bams *gb = fd->curr_bam; @@ -4308,14 +4319,25 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { errno = fd->errcode; return -2; } + + pthread_mutex_lock(&fd->command_m); + int cmd = fd->command; + pthread_mutex_unlock(&fd->command_m); + if (cmd == SAM_AT_EOF) + return -1; + hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); if (!r) return -2; fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); hts_tpool_delete_result(r, 0); } - if (!gb) + if (!gb) { + pthread_mutex_lock(&fd->command_m); + fd->command = SAM_AT_EOF; + pthread_mutex_unlock(&fd->command_m); return fd->errcode ? -2 : -1; + } bam1_t *b_array = (bam1_t *)gb->bams; if (fd->curr_idx < gb->nbams) if (!bam_copy1(b, &b_array[fd->curr_idx++])) @@ -4856,8 +4878,8 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) switch (size) { case 'Z': case 'H': - while (s < end && *s) ++s; - return s < end ? s + 1 : end; + s = memchr(s, 0, end-s); + return s ? s+1 : end; case 'B': if (end - s < 5) return NULL; size = aux_type2size(*s); ++s; diff --git a/htslib/simd.c b/htslib/simd.c index 865dd887..014a370d 100644 --- a/htslib/simd.c +++ b/htslib/simd.c @@ -42,7 +42,19 @@ DEALINGS IN THE SOFTWARE. */ #if defined __arm__ || defined __aarch64__ -#if defined __linux__ || defined __FreeBSD__ +#ifdef HAVE_OPENBSD +/* + * Extra check for elf_aux_info() on configure-less OpenBSD builds. 
Once + * version 7.5 has dropped off support, this can be changed to an assumption + * that the function exists in the Makefile-generated config.h. + */ +#include +#if OpenBSD >= 202409 +#define HAVE_ELF_AUX_INFO +#endif +#endif + +#if defined HAVE_GETAUXVAL || defined HAVE_ELF_AUX_INFO #include #elif defined __APPLE__ #include @@ -61,11 +73,11 @@ DEALINGS IN THE SOFTWARE. */ #endif static inline int cpu_supports_neon(void) { -#if defined __linux__ && defined __arm__ && defined HWCAP_NEON +#if defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_NEON return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; -#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON +#elif defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_ARM_NEON return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; -#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD +#elif defined HAVE_GETAUXVAL && defined __aarch64__ && defined HWCAP_ASIMD return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0; #elif defined __APPLE__ && defined __aarch64__ int32_t ctl; @@ -73,11 +85,11 @@ static inline int cpu_supports_neon(void) { if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0; if (ctlsize != sizeof ctl) return 0; return ctl; -#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON +#elif defined HAVE_ELF_AUX_INFO && defined __arm__ && defined HWCAP_NEON unsigned long cap; if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; return (cap & HWCAP_NEON) != 0; -#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD +#elif defined HAVE_ELF_AUX_INFO && defined __aarch64__ && defined HWCAP_ASIMD unsigned long cap; if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; return (cap & HWCAP_ASIMD) != 0; diff --git a/htslib/synced_bcf_reader.c b/htslib/synced_bcf_reader.c index 1835ea2d..fb177d27 100644 --- a/htslib/synced_bcf_reader.c +++ b/htslib/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream 
through multiple VCF files. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2023, 2025 Genome Research Ltd. Author: Petr Danecek @@ -69,6 +69,7 @@ typedef struct { sr_sort_t sort; int regions_overlap, targets_overlap; + int *closefile; // close htsfile with sync reader close or not } aux_t; @@ -251,6 +252,9 @@ void bcf_sr_destroy_threads(bcf_srs_t *files) { int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) { char fmode[5]; + int ret = 0; + const char *idxname = NULL; + strcpy(fmode, "r"); vcf_open_mode(fmode+1, fname, NULL); htsFile* file_ptr = hts_open(fname, fmode); @@ -258,6 +262,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) files->errnum = open_failed; return 0; } + //get idx name and pass to add_hreader + idxname = strstr(fname, HTS_IDX_DELIM); + idxname += idxname ? sizeof(HTS_IDX_DELIM) - 1 : 0; + if (!(ret = bcf_sr_add_hreader(files, file_ptr, 1, idxname))) { + hts_close(file_ptr); //failed, close the file + } + return ret; +} + +int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const char *idxname) +{ + aux_t *auxdata = NULL; + if ( ! 
file_ptr ) { + files->errnum = open_failed; + return 0; + } files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1)); files->has_line[files->nreaders] = 0; @@ -274,7 +294,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) BGZF *bgzf = hts_get_bgzfp(reader->file); if ( bgzf && bgzf_check_EOF(bgzf) == 0 ) { files->errnum = no_eof; - hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", fname); + hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", file_ptr->fn); } if (files->p) bgzf_thread_pool(bgzf, files->p->pool, files->p->qsize); @@ -290,7 +310,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) return 0; } - reader->tbx_idx = tbx_index_load(fname); + reader->tbx_idx = tbx_index_load2(file_ptr->fn, idxname); if ( !reader->tbx_idx ) { files->errnum = idx_load_failed; @@ -309,7 +329,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) reader->header = bcf_hdr_read(reader->file); - reader->bcf_idx = bcf_index_load(fname); + reader->bcf_idx = bcf_index_load2(file_ptr->fn, idxname); if ( !reader->bcf_idx ) { files->errnum = idx_load_failed; @@ -362,7 +382,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) return 0; } - reader->fname = strdup(fname); + reader->fname = strdup(file_ptr->fn); if ( files->apply_filters ) reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); @@ -413,6 +433,18 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) } } + if ((auxdata = BCF_SR_AUX(files))) { + //store closure status for htsfile + int *tmp = realloc(auxdata->closefile, sizeof(int) * files->nreaders); + if (!tmp) { + hts_log_error("Failed to allocate memory"); + return 0; + } + tmp[files->nreaders - 1] = autoclose; + auxdata->closefile = tmp; + } + + return 1; } @@ -426,13 +458,15 @@ bcf_srs_t *bcf_sr_init(void) return files; } -static void bcf_sr_destroy1(bcf_sr_t *reader) +static void bcf_sr_destroy1(bcf_sr_t *reader, int 
closefile) { free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (closefile) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -445,8 +479,10 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) void bcf_sr_destroy(bcf_srs_t *files) { int i; + int *autoclose = BCF_SR_AUX(files)->closefile; + for (i=0; inreaders; i++) - bcf_sr_destroy1(&files->readers[i]); + bcf_sr_destroy1(&files->readers[i], autoclose[i]); free(files->has_line); free(files->readers); for (i=0; in_smpl; i++) free(files->samples[i]); @@ -456,6 +492,7 @@ void bcf_sr_destroy(bcf_srs_t *files) if (files->tmps.m) free(files->tmps.s); if (files->n_threads) bcf_sr_destroy_threads(files); bcf_sr_sort_destroy(&BCF_SR_AUX(files)->sort); + free(autoclose); free(files->aux); free(files); } @@ -463,12 +500,15 @@ void bcf_sr_destroy(bcf_srs_t *files) void bcf_sr_remove_reader(bcf_srs_t *files, int i) { assert( !files->samples ); // not ready for this yet + int *autoclose = BCF_SR_AUX(files)->closefile; + bcf_sr_sort_remove_reader(files, &BCF_SR_AUX(files)->sort, i); - bcf_sr_destroy1(&files->readers[i]); + bcf_sr_destroy1(&files->readers[i], autoclose[i]); if ( i+1 < files->nreaders ) { memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t)); memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int)); + memmove(&autoclose[i], &autoclose[i+1], (files->nreaders-i-1)*sizeof(int)); } files->nreaders--; } @@ -1396,7 +1436,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) } // tabix index absent, reading the whole file - ret = hts_getline(reg->file, KS_SEP_LINE, ®->line); + ret = reg->file ? 
hts_getline(reg->file, KS_SEP_LINE, ®->line) : -1; if ( ret<0 ) { reg->iseq = -1; return -1; } } ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to); diff --git a/htslib/tbx.c b/htslib/tbx.c index 66250054..04b69bc2 100644 --- a/htslib/tbx.c +++ b/htslib/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023, 2025 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li @@ -96,8 +96,11 @@ int tbx_name2id(tbx_t *tbx, const char *ss) int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) { size_t i, b = 0; - int id = 1; - char *s; + int id = 1, getlen = 0, alcnt = 0, haveins = 0, lenpos = -1; + char *s, *t; + uint8_t insals[8192]; + int64_t reflen = 0, svlen = 0, fmtlen = 0, tmp = 0; + intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { if (line[i] == '\t' || line[i] == 0) { @@ -165,10 +168,42 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) intv->end = intv->beg + l; } } else if ((conf->preset&0xffff) == TBX_VCF) { - if (id == 4) { + if (id == 4) { //ref allele if (b < i) intv->end = intv->beg + (i - b); - } else if (id == 8) { // look for "END=" - int c = line[i]; + ++alcnt; + reflen = i - b; + } if (id == 5) { //alt allele + int lastbyte = 0, c = line[i]; + insals[lastbyte] = 0; + line[i] = 0; + s = line + b; + do { + t = strchr(s, ','); + if (alcnt >> 3 != lastbyte) { //initialize insals + lastbyte = alcnt >> 3; + insals[lastbyte] = 0; + } + ++alcnt; + if (t) { + *t = 0; + } + if (s[0] == '<') { + if (!strcmp("", s)) { //note inserts + insals[lastbyte] |= 1 << ((alcnt - 1) & 7); + haveins = 1; + } else if (!strcmp("<*>", s) || + !strcmp("", s)) { //note gvcf + getlen = 1; + } + } + if (t) { + *t = ','; + s = t + 1; + } + } while (t && alcnt < 65536); //max allcnt is 65535 + line[i] = c; 
+ } else if (id == 8) { //INFO, look for "END=" / "SVLEN" + int c = line[i], d = 1; line[i] = 0; s = strstr(line + b, "END="); if (s == line + b) s += 4; @@ -194,14 +229,85 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) intv->end = end; } } + s = strstr(line + b, "SVLEN="); + if (s == line + b) s += 6; //at start of info + else if (s) { //not at the start + s = strstr(line + b, ";SVLEN="); + if (s) s += 7; + } + while (s && d < alcnt) { + t = strchr(s, ','); + if ((haveins) && (insals[d >> 3] & (1 << (d & 7)))) { + tmp = 1; // + } else { + tmp = atoll(s); + tmp = tmp < 0 ? llabs(tmp) : tmp; + } + svlen = svlen < tmp ? tmp : svlen; + s = t ? t + 1 : NULL; + ++d; + } + line[i] = c; + } else if (getlen && id == 9 ) { //FORMAT + int c = line[i], pos = -1; + line[i] = 0; + s = line + b; + while (s) { + ++pos; + if (!(t = strchr(s, ':'))) { //no further fields + if (!strcmp(s, "LEN")) { + lenpos = pos; + } + break; //not present at all! + } else { + *t = '\0'; + if (!strcmp(s, "LEN")) { + lenpos = pos; + *t = ':'; + break; + } + *t = ':'; + s = t + 1; //check next one + } + } + line[i] = c; + if (lenpos == -1) { //not present + break; + } + } else if (id > 9 && getlen && lenpos != -1) { + //get LEN from sample + int c = line[i], d = 0; + line[i] = 0; tmp = 0; + s = line + b; + for (d = 0; d <= lenpos; ++d) { + if (d == lenpos) { + tmp = atoll(s); + break; + } + if ((t = strchr(s, ':'))) { + s = t + 1; + } else { + break; //not in sycn with fmt def! + } + } + fmtlen = fmtlen < tmp ? tmp : fmtlen; line[i] = c; } } } - b = i + 1; + b = i + 1; //beginning if current field ++id; } } + if ((conf->preset&0xffff) == TBX_VCF) { + tmp = reflen < svlen ? + svlen < fmtlen ? fmtlen : svlen : + reflen < fmtlen ? fmtlen : reflen ; + tmp += intv->beg; + intv->end = intv->end < tmp ? 
tmp : intv->end; + + //NOTE: 'end' calculation be in sync with end/rlen in vcf.c:get_rlen + } if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; return 0; } @@ -373,8 +479,7 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) first = 1; } ret = get_intv(tbx, &str, &intv, 1); - if (ret < -1) goto fail; // Out of memory - if (ret < 0) continue; // Skip unparsable lines + if (ret < 0) goto fail; // Out of memory or unparsable lines if (hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1) < 0) { goto fail; diff --git a/htslib/thread_pool.c b/htslib/thread_pool.c index 252a9d24..2370aa82 100644 --- a/htslib/thread_pool.c +++ b/htslib/thread_pool.c @@ -1,6 +1,6 @@ /* thread_pool.c -- A pool of generic worker threads - Copyright (c) 2013-2020 Genome Research Ltd. + Copyright (c) 2013-2020, 2025 Genome Research Ltd. Author: James Bonfield @@ -51,10 +51,13 @@ static void hts_tpool_process_detach_locked(hts_tpool *p, //#define DEBUG -#ifdef DEBUG -static int worker_id(hts_tpool *p) { - int i; +// Return the worker ID index, from 0 to nthreads-1. +// Return <0 on error, but this shouldn't be possible +int hts_tpool_worker_id(hts_tpool *p) { + if (!p) + return -1; pthread_t s = pthread_self(); + int i; for (i = 0; i < p->tsize; i++) { if (pthread_equal(s, p->t[i].tid)) return i; @@ -62,6 +65,7 @@ static int worker_id(hts_tpool *p) { return -1; } +#ifdef DEBUG void DBG_OUT(FILE *fp, char *fmt, ...) 
{ va_list args; va_start(args, fmt); @@ -95,7 +99,7 @@ static int hts_tpool_add_result(hts_tpool_job *j, void *data) { pthread_mutex_lock(&q->p->pool_m); DBG_OUT(stderr, "%d: Adding result to queue %p, serial %"PRId64", %d of %d\n", - worker_id(j->p), q, j->serial, q->n_output+1, q->qsize); + hts_tpool_worker_id(j->p), q, j->serial, q->n_output+1, q->qsize); if (--q->n_processing == 0) pthread_cond_signal(&q->none_processing_c); @@ -129,9 +133,9 @@ static int hts_tpool_add_result(hts_tpool_job *j, void *data) { || q->next_serial == INT_MAX); // ... unless flush in progress. if (r->serial == q->next_serial) { DBG_OUT(stderr, "%d: Broadcasting result_avail (id %"PRId64")\n", - worker_id(j->p), r->serial); + hts_tpool_worker_id(j->p), r->serial); pthread_cond_broadcast(&q->output_avail_c); - DBG_OUT(stderr, "%d: Broadcast complete\n", worker_id(j->p)); + DBG_OUT(stderr, "%d: Broadcast complete\n", hts_tpool_worker_id(j->p)); } pthread_mutex_unlock(&q->p->pool_m); @@ -603,7 +607,7 @@ static void *tpool_worker(void *arg) { pthread_mutex_unlock(&p->pool_m); DBG_OUT(stderr, "%d: Processing queue %p, serial %"PRId64"\n", - worker_id(j->p), q, j->serial); + hts_tpool_worker_id(j->p), q, j->serial); if (hts_tpool_add_result(j, j->func(j->arg)) < 0) goto err; @@ -625,13 +629,13 @@ static void *tpool_worker(void *arg) { shutdown: pthread_mutex_unlock(&p->pool_m); #ifdef DEBUG - fprintf(stderr, "%d: Shutting down\n", worker_id(p)); + fprintf(stderr, "%d: Shutting down\n", hts_tpool_worker_id(p)); #endif return NULL; err: #ifdef DEBUG - fprintf(stderr, "%d: Failed to add result\n", worker_id(p)); + fprintf(stderr, "%d: Failed to add result\n", hts_tpool_worker_id(p)); #endif // Hard failure, so shutdown all queues pthread_mutex_lock(&p->pool_m); @@ -1154,6 +1158,7 @@ void hts_tpool_kill(hts_tpool *p) { #ifdef TEST_MAIN #include +#include "hts_internal.h" #ifndef TASK_SIZE #define TASK_SIZE 1000 @@ -1166,7 +1171,7 @@ void hts_tpool_kill(hts_tpool *p) { void *doit_square_u(void 
*arg) { int job = *(int *)arg; - usleep(random() % 100000); // to coerce job completion out of order + hts_usleep(random() % 100000); // to coerce job completion out of order printf("RESULT: %d\n", job*job); @@ -1207,7 +1212,7 @@ void *doit_square(void *arg) { // One excessively slow, to stress test output queue filling and // excessive out of order scenarios. - usleep(500000 * ((job&31)==31) + random() % 10000); + hts_usleep(500000 * ((job&31)==31) + random() % 10000); res = malloc(sizeof(*res)); *res = (job<0) ? -job*job : job*job; @@ -1253,7 +1258,7 @@ int test_square(int n) { // The alternative is a separate thread for dispatching and/or // consumption of results. See test_squareB. putchar('.'); fflush(stdout); - usleep(10000); + hts_usleep(10000); } } while (blk == -1); } @@ -1404,7 +1409,7 @@ static void *pipe_stage1(void *arg) { pipe_job *j = (pipe_job *)arg; j->x <<= 8; - usleep(random() % 10000); // fast job + hts_usleep(random() % 10000); // fast job printf("1 %08x\n", j->x); return j; @@ -1430,7 +1435,7 @@ static void *pipe_stage2(void *arg) { pipe_job *j = (pipe_job *)arg; j->x <<= 8; - usleep(random() % 100000); // slow job + hts_usleep(random() % 100000); // slow job printf("2 %08x\n", j->x); return j; @@ -1455,7 +1460,7 @@ static void *pipe_stage2to3(void *arg) { static void *pipe_stage3(void *arg) { pipe_job *j = (pipe_job *)arg; - usleep(random() % 10000); // fast job + hts_usleep(random() % 10000); // fast job j->x <<= 8; return j; } diff --git a/htslib/vcf.c b/htslib/vcf.c index 105c7539..ca2055fb 100644 --- a/htslib/vcf.c +++ b/htslib/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. 
Author: Heng Li @@ -116,6 +116,7 @@ typedef struct vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings + int version; //cached version } bcf_hdr_aux_t; @@ -124,6 +125,70 @@ static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) return (bcf_hdr_aux_t *)hdr->dict[0]; } +//version macros +#define VCF_DEF 4002000 +#define VCF44 4004000 +#define VCF45 4005000 + +#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 ) +#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 ) + +/** + * bcf_get_version - get the version as int + * @param hdr - bcf header, to get version + * @param verstr- version string, which is already available + * Returns version on success and default version on failure + * version = major * 100 * 10000 + minor * 1000 + */ +static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) +{ + const char *version = NULL, vcf[] = "VCFv"; + char *major = NULL, *minor = NULL; + int ver = -1; + long tmp = 0; + bcf_hdr_aux_t *aux = NULL; + + if (!hdr && !verstr) { //invalid input + goto fail; + } + + if (hdr) { + if ((aux = get_hdr_aux(hdr)) && aux->version != 0) { //use cached version + return aux->version; + } + //get from header + version = bcf_hdr_get_version(hdr); + } else { + //get from version string + version = verstr; + } + if (!(major = strstr(version, vcf))) { //bad format + goto fail; + } + major += sizeof(vcf) - 1; + if (!(minor = strchr(major, '.'))) { //bad format + goto fail; + } + tmp = strtol(major, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver = tmp * 100 * 10000; + tmp = strtol(++minor, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver += 
tmp * 1000; + return ver; + +fail: + hts_log_warning("Couldn't get VCF version, considering as %d.%d", + VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF)); + return VCF_DEF; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -132,6 +197,8 @@ static char *find_chrom_header_line(char *s) else return NULL; } +static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v); + /************************* *** VCF header parser *** *************************/ @@ -985,7 +1052,6 @@ static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { - // currently only for bcf_hdr_set_version assert( hrec->type==BCF_HL_GEN ); int ret; khint_t k; @@ -1014,6 +1080,12 @@ int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) free(hrec->value); hrec->value = strdup(tmp->value); if ( !hrec->value ) return -1; + kh_val(aux->gen,k) = hrec; + + if (!strcmp(hrec->key,"fileformat")) { + //update version + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); + } return 0; } @@ -1037,7 +1109,6 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) bcf_hrec_destroy(hrec); return 0; } - // Is one of the generic fields and already present? 
if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) { @@ -1052,6 +1123,9 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) free(str.s); return 0; } + if (!strcmp(hrec->key, "fileformat")) { + aux->version = bcf_get_version(NULL, hrec->value); + } } int i; @@ -1387,6 +1461,8 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); + + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); } else { @@ -1399,6 +1475,7 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) bcf_hrec_destroy(tmp); } hdr->dirty = 1; + //TODO rlen may change, deal with it return 0; // FIXME: check for errs in this function (return < 0 if so) } @@ -1420,6 +1497,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); + aux->version = 0; free(h->dict[0]); h->dict[0] = aux; @@ -1428,6 +1506,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_append(h, "##fileformat=VCFv4.2"); // The filter PASS must appear first in the dictionary bcf_hdr_append(h, "##FILTER="); + aux->version = VCF_DEF; } return h; @@ -1741,7 +1820,6 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { uint32_t err = 0; int type = 0; int num = 0; - int reflen = 0; uint32_t i, reports; const uint32_t is_integer = ((1 << BCF_BT_INT8) | (1 << BCF_BT_INT16) | @@ -1790,7 +1868,6 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type)); err |= BCF_ERR_CHAR; } - if (i == 0) reflen = num; bytes = (size_t) num << bcf_type_shift[type]; if (end - ptr < bytes) goto bad_shared; ptr += bytes; @@ -1879,7 +1956,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { 
bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen); warned = 1; } - rec->rlen = reflen >= 0 ? reflen : 0; + //find rlen considering reflen, END, SVLEN, fmt LEN + hts_pos_t len = get_rlen(hdr, rec); + rec->rlen = len >= 0 ? len : 0; } rec->errcode |= err; @@ -3061,8 +3140,10 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *t = q + 1; int m = 0; // m: sample id const int nsamples = bcf_hdr_nsamples(h); - const char *end = s->s + s->l; + + int ver = bcf_get_version(h, NULL); + while ( tis_gt) { // Genotypes. - // ([|/])+... where is [0-9]+ or ".". + //([/|])?)([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - int overflow = 0; + int overflow = 0, ploidy = 0, anyunphased = 0, \ + phasingprfx = 0, unknown1 = 0; + + /* with prefixed phasing, it is explicitly given for 1st one + with non-prefixed, set based on ploidy and phasing of other + alleles. */ + if (ver >= VCF44 && (*t == '|' || *t == '/')) { + // cache prefix and phasing status + is_phased = *t++ == '|'; + phasingprfx = 1; + } + for (l = 0;; ++t) { + ploidy++; if (*t == '.') { ++t, x[l++] = is_phased; + if (l==1) { //for 1st allele only + unknown1 = 1; + } } else { const char *tt = t; uint32_t val; @@ -3125,9 +3221,21 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } + anyunphased |= (ploidy != 1) && !is_phased; is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } + if (ver >= VCF44 && !phasingprfx) { + /* no explicit phasing for 1st allele, set based on + other alleles and ploidy */ + if (ploidy == 1) { //implicitly phased + if (!unknown1) { + x[0] |= 1; + } + } else { //set by other unphased alleles + x[0] |= (anyunphased)? 0 : 1; + } + } // Possibly check max against v->n_allele instead? 
if (overflow || max > (INT32_MAX >> 1) - 1) { hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); @@ -3609,8 +3717,6 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p negative_rlen_warned = 1; } } - else - v->rlen = val1 - v->pos; } } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)a_val; @@ -3662,7 +3768,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over // the more straight forward looking strcmp, giving a speed advantage. if (ks_resize(s, s->l+4) < 0) - return -1; + return -2; // Force our memory to be initialised so we avoid the technicality of // undefined behaviour in using a 4-byte memcmp. (The reality is this @@ -3792,12 +3898,13 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (p) { *(q = (char*)aux.p) = 0; - return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; - } else { - return 0; + if (vcf_parse_format(s, h, v, p, q)) { + goto err; + } } end: + v->rlen = get_rlen(h, v); //set rlen based on version ret = 0; err: @@ -4129,7 +4236,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) uint8_t *ptr = (uint8_t *)v->indiv.s; int gt_i = -1; bcf_fmt_t *fmt = v->d.fmt; - int first = 1; + int first = 1, ret = 0; int fmt_packed = !(v->unpacked & BCF_UN_FMT); if (fmt_packed) { @@ -4165,6 +4272,8 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!id || !id->key) { hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1); errno = EINVAL; + if (fmt_packed) + free(fmt); return -1; } @@ -4187,7 +4296,13 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!first) kputc_(':', s); first = 0; if (gt_i == i) { - bcf_format_gt(f,j,s); + if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) { + hts_log_error("Failed to format GT value for sample %d, 
returned %d", i, ret); + errno = EINVAL; + if (fmt_packed) + free(fmt); + return -1; + } break; } else if (f->n == 1) @@ -4588,6 +4703,17 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) if (res < 0) return NULL; need_sync += res; } + else if ( !strcmp(src->hrec[i]->key,"fileformat") ) + { + int ver_src = bcf_get_version(src,src->hrec[i]->value); + int ver_dst = bcf_get_version(dst,dst->hrec[j]->value); + if ( ver_src > ver_dst ) + { + if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0) + return NULL; + need_sync = 1; + } + } } else if ( src->hrec[i]->type==BCF_HL_STR ) { @@ -5025,8 +5151,8 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t if ( *a && !*r ) { - if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( *a ) a++; + if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return; } else if ( *r && !*a ) @@ -5042,6 +5168,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t const char *re = r, *ae = a; while ( re[1] ) re++; while ( ae[1] ) ae++; + if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; } if ( ae==a ) { @@ -5161,13 +5288,14 @@ int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, else return bitmask & type; } // mode == bcf_match_exact + if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0; return type==bitmask ? type : 0; } int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) { static int negative_rlen_warned = 0; - int is_end_tag; + int is_end_tag, is_svlen_tag = 0; // Is the field already present? 
int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); @@ -5175,6 +5303,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); is_end_tag = strcmp(key, "END") == 0; + is_svlen_tag = strcmp(key, "SVLEN") == 0; for (i=0; in_info; i++) if ( inf_id==line->d.info[i].key ) break; @@ -5182,8 +5311,6 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v if ( !n || (type==BCF_HT_STR && !values) ) { - if ( n==0 && is_end_tag ) - line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0; if ( inf ) { // Mark the tag for removal, free existing memory if necessary @@ -5196,6 +5323,9 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v inf->vptr = NULL; inf->vptr_off = inf->vptr_len = 0; } + if ( n==0 && (is_end_tag || is_svlen_tag) ) { + line->rlen = get_rlen(hdr, line); + } return 0; } @@ -5291,12 +5421,12 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1); negative_rlen_warned = 1; } - line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0; } - else - line->rlen = end - line->pos; } } + if (is_svlen_tag || is_end_tag) { + line->rlen = get_rlen(hdr, line); + } return 0; } @@ -5330,6 +5460,7 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const { // Is the field already present? int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); + int is_len = 0; if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) ) { if ( !n ) return 0; @@ -5342,6 +5473,7 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const if ( line->d.fmt[i].id==fmt_id ) break; bcf_fmt_t *fmt = i==line->n_fmt ? 
NULL : &line->d.fmt[i]; + is_len = strcmp(key, "LEN") == 0; if ( !n ) { if ( fmt ) @@ -5355,6 +5487,9 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const line->d.indiv_dirty = 1; fmt->p = NULL; } + if (is_len) { + line->rlen = get_rlen(hdr, line); + } return 0; } @@ -5427,6 +5562,10 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const } } line->unpacked |= BCF_UN_FMT; + + if (is_len) { + line->rlen = get_rlen(hdr, line); + } return 0; } @@ -5494,6 +5633,7 @@ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals) { line->d.shared_dirty |= BCF1_DIRTY_ALS; + line->d.var_type = -1; line->n_allele = nals; hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele); @@ -5507,20 +5647,8 @@ static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nal als++; n++; } - // Update REF length. Note that END is 1-based while line->pos 0-based - bcf_info_t *end_info = bcf_get_info(hdr,line,"END"); - if ( end_info ) - { - if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL; - else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL; - } - if ( end_info && end_info->v1.i > line->pos ) - line->rlen = end_info->v1.i - line->pos; - else if ( nals > 0 ) - line->rlen = strlen(line->d.allele[0]); - else - line->rlen = 0; + line->rlen = get_rlen(hdr, line); return 0; } @@ -5950,3 +6078,321 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +/** + * bcf_format_gt_v2 - formats GT information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is preferred over bcf_format_gt as this supports 
vcf4.4 and + * prefixed phasing. Explicit / prefixed phasing for 1st allele is used only + * when it is a must to correctly express phasing. + * correctly express phasing. + */ +int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) +{ + uint32_t e = 0; + int ploidy = 1, anyunphased = 0; + int32_t val0 = 0; + size_t pos = str ? str->l : 0; + + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ + int i; \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ + { \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if (!i) { val0 = val; } \ + if (i) { \ + e |= kputc("/|"[val & 1], str) < 0; \ + anyunphased |= !(val & 1); \ + } \ + if (!(val >> 1)) e |= kputc('.', str) < 0; \ + else e |= kputw((val >> 1) - 1, str) < 0; \ + } \ + if (i == 0) e |= kputc('.', str) < 0; \ + ploidy = i; \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, + bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, + bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, + bcf_int32_vector_end); break; + case BCF_BT_NULL: e |= kputc('.', str) < 0; break; + default: hts_log_error("Unexpected type %d", fmt->type); return -2; + } + #undef BRANCH + + if (hdr && get_hdr_aux(hdr)->version >= VCF44) { + //output which supports prefixed phasing + + /* update 1st allele's phasing if required and append rest to it. + use prefixed phasing only when it is a must. i.e. without which the + inferred value will be incorrect */ + if (val0 & 1) { + /* 1st one is phased, if ploidy is > 1 and an unphased allele exists + need to specify explicitly */ + e |= (ploidy > 1 && anyunphased) ? + (kinsert_char('|', pos, str) < 0) : + (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p + (kinsert_char('|', pos, str) < 0) : + 0); + } else { + /* 1st allele is unphased, if ploidy is = 1 or allele is '.' 
or + ploidy > 1 and no other unphased allele exist, need to specify + explicitly */ + e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ? + (kinsert_char('/', pos, str) < 0) : + 0; + } + } + return e == 0 ? 0 : -1; +} + +/** + * get_rlen - calculates and returns rlen value + * @param h - bcf header + * @param v - bcf data + * Returns rlen calculated on success and -1 on failure. + * rlen calculation is dependent on vcf version and a few other field data. + * When bcf decoded data is available, refers it. When not available, retrieves + * required field data by seeking on the data stream. + * Ideally pos & version be set appropriately before any info/format field + * update to have proper rlen calculation. + * As version is not kept properly updated in practice, it is ignored in calcs. + */ +static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v) +{ + uint8_t *f = (uint8_t*)v->shared.s, *t = NULL, + *e = (uint8_t*)v->shared.s + v->shared.l; + int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, haveins = 0; + bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl; + bcf_fmt_t *lenfmt = NULL, len_lcl; + + //holds allele status for the max no of alleles + uint8_t insals[8192]; + //pos from info END, fmt LEN, info SVLEN + hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos; + int64_t len_ref = 0, len = 0, tmp; + lenid = bcf_hdr_id2int(h, BCF_DT_ID, "LEN"); + svlenid = bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN"); + endid = bcf_hdr_id2int(h, BCF_DT_ID, "END"); + + //initialise bytes which are to be used + memset(insals, 0, 1 + v->n_allele / 8); + + //use decoded data where ever available and where not, get from stream + if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) { + for (i = 1; i < v->n_allele; ++i) { + //checks only alt alleles, with NUL + if (!strcmp(v->d.allele[i], "")) { + //ins allele, note to skip corresponding svlen val + insals[i >> 3] |= 1 << (i & 7); + haveins = 1; + } else if (!strcmp(v->d.allele[i], 
"<*>") || + !strcmp(v->d.allele[i], "")) { + gvcf = 1; //gvcf present, have to check for LEN field + } + } + f += v->unpack_size[0] + v->unpack_size[1]; + len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0; + } else if (f < e) { + //skip ID + size = bcf_dec_size(f, &f, &type); + f += size << bcf_type_shift[type]; + // REF, ALT + for (i = 0; i < v->n_allele; ++i) { + //check all alleles, w/o NUL + size = bcf_dec_size(f, &f, &type); + if (!i) { //REF length + len_ref = size; + } else { + if (size == 5 && !strncmp((char*)f, "", size)) { + //ins allele, note to skip corresponding svlen val + insals[i >> 3] |= 1 << (i & 7); + haveins = 1; + } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) || + (size == 9 && !strncmp((char*)f, "", size))) { + gvcf = 1; //gvcf present, have to check for LEN field + } + } + f += size << bcf_type_shift[type]; + } + } + // FILTER + if (v->unpacked & BCF_UN_FLT) { + f += v->unpack_size[2]; + } else if (f < e) { + size = bcf_dec_size(f, &f, &type); + f += size << bcf_type_shift[type]; + } + // INFO + if (svlenid >= 0 || endid >= 0 ) { //only if end/svlen present + if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) { + endinfo = bcf_get_info(h, v, "END"); + svleninfo = bcf_get_info(h, v, "SVLEN"); + } else if (f < e) { + for (i = 0; i < v->n_info; ++i) { + id = bcf_dec_typed_int1(f, &t); + if (id == endid) { //END + t = bcf_unpack_info_core1(f, &end_lcl); + endinfo = &end_lcl; + if (svleninfo || svlenid < 0) { + break; //already got svlen or no need to search further + } + } else if (id == svlenid) { //SVLEN + t = bcf_unpack_info_core1(f, &svlen_lcl); + svleninfo = &svlen_lcl; + if (endinfo || endid < 0 ) { + break; //already got end or no need to search further + } + } else { + f = t; + size = bcf_dec_size(f, &t, &type); + t += size << bcf_type_shift[type]; + } + f = t; + } + } + } + // FORMAT + if (lenid >= 0 && gvcf) { + //with LEN and has gvcf allele + f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + 
v->indiv.l; + if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) { + lenfmt = bcf_get_fmt(h, v, "LEN"); + } else if (f < e) { + for (i = 0; i < v->n_fmt; ++i) { + id = bcf_dec_typed_int1(f, &t); + if (id == lenid) { + t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl); + lenfmt = &len_lcl; + break; //that's all needed + } else { + f = t; + size = bcf_dec_size(f, &t, &type); + t += size * v->n_sample << bcf_type_shift[type]; + } + f = t; + } + } + } + //got required data, find end and rlen + if (endinfo && endinfo->vptr) { //end position given by info END + //end info exists, not being deleted + end = endinfo->v1.i; + switch(endinfo->type) { + case BCF_BT_INT8: end = end == bcf_int8_missing ? 0 : end; break; + case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break; + case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break; + case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break; + default: end = 0; break; //invalid + } + } + + if (svleninfo && svleninfo->vptr) { + //svlen info exists, not being deleted + bad = 0; + //get largest svlen, except ; expects to be . for non SV alleles + for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) { + switch(svleninfo->type) { + case BCF_BT_INT8: + tmp = ((int8_t*)svleninfo->vptr)[i]; + tmp = tmp == bcf_int8_missing ? 0 : tmp; + break; + case BCF_BT_INT16: + tmp = ((int16_t*)svleninfo->vptr)[i]; + tmp = tmp == bcf_int16_missing ? 0 : tmp; + break; + case BCF_BT_INT32: + tmp = ((int32_t*)svleninfo->vptr)[i]; + tmp = tmp == bcf_int32_missing ? 0 : tmp; + break; + case BCF_BT_INT64: + tmp = ((int64_t*)svleninfo->vptr)[i]; + tmp = tmp == bcf_int64_missing ? 0 : tmp; + break; + default: //invalid + tmp = 0; + bad = 1; + break; + } + if (bad) { //stop svlen check + len = 0; + break; + } + //expects only SV will have valid svlen and rest have '.' + if ((haveins) && (insals[i >> 3] & (1 << ((i + 1) & 7)))) { + continue; //skip svlen for + } + tmp = tmp < 0 ? 
llabs(tmp) : tmp; + if (len < tmp) len = tmp; + } + } + if ((!svleninfo || !len) && end) { //no svlen, infer from end + len = end > v->pos ? end - v->pos - 1 : 0; + } + end_svlen = v->pos + len + 1; //end position found from SVLEN + + len = 0; + if (lenfmt && lenfmt->p) { + //fmt len exists, not being deleted, has gvcf and version >= 4.5 + int j = 0; + int64_t offset = 0; + bad = 0; + for (i = 0; i < v->n_sample; ++i) { + for (j = 0; j < lenfmt->n; ++j) { + switch(lenfmt->type) { + case BCF_BT_INT8: + tmp = (((int8_t*)lenfmt->p + offset))[j]; + tmp = tmp == bcf_int8_missing ? 0 : tmp; + break; + case BCF_BT_INT16: + tmp = ((int16_t*)(lenfmt->p + offset))[j]; + tmp = tmp == bcf_int16_missing ? 0 : tmp; + break; + case BCF_BT_INT32: + tmp = ((int32_t*)(lenfmt->p + offset))[j]; + tmp = tmp == bcf_int32_missing ? 0 : tmp; + break; + case BCF_BT_INT64: + tmp = ((int64_t*)(lenfmt->p + offset))[j]; + tmp = tmp == bcf_int64_missing ? 0 : tmp; + break; + default: //invalid + bad = 1; + break; + } + if (bad) { //stop LEN check + len = 0; + break; + } + //assumes only gvcf have valid LEN + if (len < tmp) len = tmp; + } + offset += j << bcf_type_shift[lenfmt->type]; + } + } + if ((!lenfmt || !len) && end) { //no fmt len, infer from end + len = end > v->pos ? end - v->pos : 0; + } + end_fmtlen = v->pos + len; //end position found from LEN + + //get largest pos, based on END, SVLEN, fmt LEN and length using it + hpos = end < end_svlen ? + end_svlen < end_fmtlen ? end_fmtlen : end_svlen : + end < end_fmtlen ? end_fmtlen : end; + len = hpos - v->pos; + + //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1 + + /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len. + Relevance of these fields vary across different vcf versions. + Many times, these info/fmt fields are used without version updates; + hence these fields are used for calculation disregarding vcf version */ + return len < len_ref ? 
len_ref : len; +} diff --git a/htslib/version.sh b/htslib/version.sh index f35234c2..8946192f 100755 --- a/htslib/version.sh +++ b/htslib/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.21 +VERSION=1.22 # If we have a git clone, then check against the current tag srcdir=${0%/version.sh} diff --git a/pysam/samtools.py b/pysam/samtools.py index 56f6fb07..97efd675 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -8,6 +8,7 @@ bedcov = pysam.utils.PysamDispatcher('samtools', 'bedcov') calmd = pysam.utils.PysamDispatcher('samtools', 'calmd') cat = pysam.utils.PysamDispatcher('samtools', 'cat') +checksum = pysam.utils.PysamDispatcher('samtools', 'checksum') collate = pysam.utils.PysamDispatcher('samtools', 'collate') consensus = pysam.utils.PysamDispatcher('samtools', 'consensus') coverage = pysam.utils.PysamDispatcher('samtools', 'coverage') @@ -47,7 +48,7 @@ __all__ = [ 'addreplacerg', 'ampliconclip', 'ampliconstats', - 'bam2fq', 'bamshuf', 'bedcov', 'calmd', 'cat', + 'bam2fq', 'bamshuf', 'bedcov', 'calmd', 'cat', 'checksum', 'collate', 'consensus', 'coverage', 'cram_size', 'depad', 'depth', 'dict', 'faidx', 'fasta', 'fastq', 'fixmate', 'flags', 'flagstat', 'fqidx', diff --git a/pysam/version.h b/pysam/version.h index 1fb0cff5..514fe950 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.21 (pysam)" -#define BCFTOOLS_VERSION "1.21 (pysam)" -#define HTS_VERSION_TEXT "1.21 (pysam)" +#define SAMTOOLS_VERSION "1.22 (pysam)" +#define BCFTOOLS_VERSION "1.22 (pysam)" +#define HTS_VERSION_TEXT "1.22 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index f9297e56..a4ee4844 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information __version__ = "0.23.1" -__samtools_version__ = "1.21" -__bcftools_version__ = "1.21" 
-__htslib_version__ = "1.21" +__samtools_version__ = "1.22" +__bcftools_version__ = "1.22" +__htslib_version__ = "1.22" diff --git a/samtools/LICENSE b/samtools/LICENSE index f096c2de..e058b43b 100644 --- a/samtools/LICENSE +++ b/samtools/LICENSE @@ -1,6 +1,6 @@ The MIT/Expat License -Copyright (C) 2008-2024 Genome Research Ltd. +Copyright (C) 2008-2025 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/samtools/README b/samtools/README index d681d281..7027e9bd 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.21 # Within the unpacked release directory + cd .../samtools-1.22 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.21 # Within the unpacked release directory + cd .../samtools-1.22 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.21 # Within the unpacked release directory + cd .../samtools-1.22 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. 
In that case you can use: - cd .../samtools-1.21 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.21 + cd .../samtools-1.22 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.22 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index c3e4f04d..3f877931 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -402,7 +402,7 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, int *hist = dh->hist[file]; k = 0; if (overlap_clip) { - if (i+oplen < overlap_clip) { + if (i+oplen <= overlap_clip) { i += oplen; break; } else if (i < overlap_clip) { @@ -430,8 +430,9 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, int *hist = dh->hist[file]; k = 0; if (overlap_clip) { - if (i+oplen < overlap_clip) { + if (i+oplen <= overlap_clip) { i += oplen; + spos += oplen; break; } else if (i < overlap_clip) { oplen -= overlap_clip - i; diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 1a1176c7..c238ca0a 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -404,7 +404,7 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, int *hist = dh->hist[file]; k = 0; if (overlap_clip) { - if (i+oplen < overlap_clip) { + if (i+oplen <= overlap_clip) { i += oplen; break; } else if (i < overlap_clip) { @@ -432,8 +432,9 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, int *hist = dh->hist[file]; k = 0; if (overlap_clip) { - if (i+oplen < overlap_clip) { + if (i+oplen <= overlap_clip) { i += oplen; + spos += oplen; break; } else if (i < overlap_clip) { oplen -= overlap_clip - i; diff --git a/samtools/bam_checksum.c b/samtools/bam_checksum.c new file mode 100644 index 00000000..17c8a3a7 --- /dev/null +++ 
b/samtools/bam_checksum.c @@ -0,0 +1,1322 @@ +/* bam_checksum.c -- produces checksums on SAM/BAM/CRAM/FASTA/FASTQ data + + Copyright (C) 2024 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * This is inspired by Biobambam's bamseqchksum tool written by + * David Jackson and amended by German Tischler. + * + * It computes order agnostic checksums for a variety of SAM fields, allowing + * validation that all the data is still present at different stages of an + * analysis pipeline. This may be useful to detect sequences which have been + * lost by an aligner, memory corruptions flipping individual sequence bases, + * or file format decoding errors. + * + * We start with something basic such as a FASTQ file, and name, seq and qual + * checksums should still all match after aligning and sorting. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "sam_opts.h" +#include "sam_utils.h" +#include "samtools.h" + +typedef struct { + int req_flags, excl_flags; // BAM flags filtering + int flag_mask, rev_comp, in_order, sanitize; + int check_pos, check_cigar, check_mate; + char *tag_str; // X,Y,Z or "*,X,Y,Z" for negation + char *tag_free;// copy of tag_str if non-literal + char **tags; // parsed and split tag_str + int ntags; + int64_t nrec; + int verbose; // whether to show zero count lines + int show_pass; // show pass stats + int show_fail; // show fail stats + int show_combine; // show the combine column + FILE *fp; + int tabs; + int merge; // merge checksum output, rather than read BAM et al. + int compat; // compatibility with bamseqchksum format +} opts; + +/* ---------------------------------------------------------------------- + * Utility functions. Possible candidates for moving to htslib? + */ + +// Note: qual+33 is a pain, but only for the benefit of compatability with +// biobambam's bamseqchksum. It's also wrong for QUAL "*" as it triggers a +// wraparound and turning from BAM's 0xff-run to ASCII makes no sense in a +// checksum. + +#if 1 +// Nibble at a time. This could be sped up further. Eg see htslib's simd.c. +// That code ought to be expanded upon and exposed from htslib. +// +// However this is still 2.4x quicker than the naive implementation below +// It's now around 8% of CPU for a NovaSeq BAM, so some optimisation is +// possible but we're at deminishing returns. 
+void fill_seq_qual(opts *o, bam1_t *b, uint8_t *restrict seq_buf, + uint8_t *restrict qual_buf) { + // Tables mapping a pair of nibbles to a pair of ASCII bytes + static const char code2fwdbase[512] = + "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" + "A=AAACAMAGARASAVATAWAYAHAKADABAN" + "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN" + "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN" + "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN" + "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN" + "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN" + "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN" + "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN" + "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN" + "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN" + "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN" + "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN" + "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN" + "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN" + "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN"; + + static const char code2revbase[512] = + "==T=G=K=C=Y=S=B=A=W=R=D=M=H=V=N=" + "=TTTGTKTCTYTSTBTATWTRTDTMTHTVTNT" + "=GTGGGKGCGYGSGBGAGWGRGDGMGHGVGNG" + "=KTKGKKKCKYKSKBKAKWKRKDKMKHKVKNK" + "=CTCGCKCCCYCSCBCACWCRCDCMCHCVCNC" + "=YTYGYKYCYYYSYBYAYWYRYDYMYHYVYNY" + "=STSGSKSCSYSSSBSASWSRSDSMSHSVSNS" + "=BTBGBKBCBYBSBBBABWBRBDBMBHBVBNB" + "=ATAGAKACAYASABAAAWARADAMAHAVANA" + "=WTWGWKWCWYWSWBWAWWWRWDWMWHWVWNW" + "=RTRGRKRCRYRSRBRARWRRRDRMRHRVRNR" + "=DTDGDKDCDYDSDBDADWDRDDDMDHDVDND" + "=MTMGMKMCMYMSMBMAMWMRMDMMMHMVMNM" + "=HTHGHKHCHYHSHBHAHWHRHDHMHHHVHNH" + "=VTVGVKVCVYVSVBVAVWVRVDVMVHVVVNV" + "=NTNGNKNCNYNSNBNANWNRNDNMNHNVNNN"; + + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + + if ((b->core.flag & BAM_FREVERSE) && o->rev_comp) { + int i, j, len2 = b->core.l_qseq & ~1; + for (i=0, j=b->core.l_qseq-1; i < len2; i+=2, j-=2) { + memcpy(&seq_buf[j-1], &code2revbase[(size_t)seq[i>>1]*2], 2); + qual_buf[j-0] = qual[i+0]+33; + qual_buf[j-1] = qual[i+1]+33; + } + if (i < b->core.l_qseq) { + seq_buf[j] = "=TGKCYSBAWRDMHVN"[bam_seqi(seq, i)]; + qual_buf[j] = qual[i]+33; + } + } else { + int i, j, len2 = b->core.l_qseq & ~1; + for (i = j = 0; i < len2; i+=2, j++) { + // Note 
size_t cast helps gcc optimiser. + memcpy(&seq_buf[i], &code2fwdbase[(size_t)seq[j]*2], 2); + // Simple, but a union approach is a little faster with clang. + qual_buf[i+0] = qual[i+0]+33; + qual_buf[i+1] = qual[i+1]+33; + } + if (i < b->core.l_qseq) { + seq_buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + qual_buf[i] = qual[i]+33; + } + } +} + +#else +// Simple version +void fill_seq_qual(opts *o, bam1_t *b, uint8_t *restrict seq_buf, + uint8_t *restrict qual_buf) { + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + + if ((b->core.flag & BAM_FREVERSE) && o->rev_comp) { + for (int i=0, j=b->core.l_qseq-1; i < b->core.l_qseq; i++,j--) { + seq_buf[j] = "=TGKCYSBAWRDMHVN"[bam_seqi(seq, i)]; + qual_buf[j] = qual[i]+33; + } + } else { + for (int i = 0; i < b->core.l_qseq; i++) { + seq_buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + qual_buf[i] = qual[i]+33; + } + } +} +#endif + + +/* ---------------------------------------------------------------------- + * Checksum aggregation + */ + +/* + * The hash is multiplicative within a finite field, modulo PRIME. + * We need to avoid zeros, and the data type has to be large enough to ensure + * no wraparound happens (other than the intended modulo). + * + * A simpler version would be (hash + crc) % PRIME, but we use the + * multiplicative version to keep compatibility with biobambam2. 
+ */ +#define PRIME ((1u<<31)-1) +uint64_t update_hash(uint64_t hash, uint32_t crc) { + crc &= PRIME; + if (crc == 0 || crc == PRIME) + crc = 1; + + return (hash * crc) % PRIME; +} + +typedef struct { + uint64_t seq[3]; // flag + seq + uint64_t name[3]; // name + flag + seq + uint64_t qual[3]; // flag + seq + qual + uint64_t aux[3]; // flag + seq + aux + uint64_t pos[3]; // flag + seq + chr/pos + uint64_t cigar[3]; // flag + seq + cigar + uint64_t mate[3]; // flag + seq + rnext/pnext/tlen + uint64_t count[3]; +} sums_t; + +typedef struct { + uint32_t seq; + uint32_t name; + uint32_t qual; + uint32_t aux; + uint32_t pos; + uint32_t cigar; + uint32_t mate; +} crcs_t; + +KHASH_MAP_INIT_STR(chk, sums_t) + +// Initialise the sums. To 1 as we're multiplying and 0 is banned. +// (Except count which is literally just a counter) +void sums_init(sums_t *h32) { + for (int i = 0; i < 3; i++) { + h32->seq[i] = 1; + h32->name[i] = 1; + h32->qual[i] = 1; + h32->aux[i] = 1; + h32->pos[i] = 1; + h32->cigar[i] = 1; + h32->mate[i] = 1; + h32->count[i] = 0; + } +} + +// Updates a single row in the checksum output +void sums_update_row(int row, sums_t *h32, const crcs_t *c, + uint32_t count_crc, uint64_t n) { + h32->seq[row] = update_hash(h32->seq[row], count_crc ^ c->seq); + h32->name[row] = update_hash(h32->name[row], count_crc ^ c->name); + h32->qual[row] = update_hash(h32->qual[row], count_crc ^ c->qual); + h32->aux[row] = update_hash(h32->aux[row], count_crc ^ c->aux); + h32->pos[row] = update_hash(h32->pos[row], count_crc ^ c->pos); + h32->cigar[row]= update_hash(h32->cigar[row],count_crc ^ c->cigar); + h32->mate[row] = update_hash(h32->mate[row], count_crc ^ c->mate); + h32->count[row] += n; +} + +// Updates a single group, with all/pass or all/fail rows. Also handles the +// in_order modes. +void sums_update(int qcfail, sums_t *h32, const crcs_t *crcs, opts *o, + uint64_t count) { + uint32_t count_crc = 0; + if (o->in_order) { + uint8_t c[8]; + u64_to_le(o->in_order == 1 ? 
count : h32->count[0], c); + count_crc = hts_crc32(0, c, 8); + } + + sums_update_row(0, h32, crcs, count_crc, 1); + if (o->show_pass && !qcfail) + sums_update_row(1, h32, crcs, count_crc, 1); + if (o->show_fail && qcfail) + sums_update_row(2, h32, crcs, count_crc, 1); +} + +// Report single group (all, pass, fail) +void sums_report(opts *o, sums_t *h32, const char *set) { + for (int r = 0; r <= 2; r++) { + uint64_t hc = 1; + char *pass[] = {"all", "pass", "fail"}; + + if (r == 1 && !o->show_pass) + continue; + if (r == 2 && !o->show_fail) + continue; + + if (!o->verbose && !h32->count[r]) + continue; + + if (o->tabs) { + fprintf(o->fp, "%s\t%s\t%"PRIu64"\t%s%"PRIx64"\t%"PRIx64 + "\t%"PRIx64"\t%"PRIx64, set, pass[r], h32->count[r], + o->compat ? "\t" : "", + h32->seq[r], h32->name[r], h32->qual[r], h32->aux[r]); + if (o->check_pos) + fprintf(o->fp, "\t%"PRIx64, h32->pos[r]); + if (o->check_cigar) + fprintf(o->fp, "\t%"PRIx64, h32->cigar[r]); + if (o->check_mate) + fprintf(o->fp, "\t%"PRIx64, h32->mate[r]); + } else { + fprintf(o->fp, "%-10s %-4s %12"PRIu64" %08"PRIx64" %08"PRIx64 + " %08"PRIx64" %08"PRIx64, set, pass[r], h32->count[r], + h32->seq[r], h32->name[r], h32->qual[r], h32->aux[r]); + if (o->check_pos) + fprintf(o->fp, " %08"PRIx64, h32->pos[r]); + if (o->check_cigar) + fprintf(o->fp, " %08"PRIx64, h32->cigar[r]); + if (o->check_mate) + fprintf(o->fp, " %08"PRIx64, h32->mate[r]); + } + + // Merge all + hc = update_hash(hc, h32->count[r]>>32); + hc = update_hash(hc, h32->count[r] & 0xffffffff); + hc = update_hash(hc, h32->seq[r]); + hc = update_hash(hc, h32->name[r]); + hc = update_hash(hc, h32->seq[r]); + hc = update_hash(hc, h32->aux[r]); + if (o->check_pos) + hc = update_hash(hc, h32->pos[r]); + if (o->check_cigar) + hc = update_hash(hc, h32->cigar[r]); + if (o->check_mate) + hc = update_hash(hc, h32->mate[r]); + + if (o->show_combine) { + if (o->tabs) + fprintf(o->fp, "\t%"PRIx64"\n", hc); + else + fprintf(o->fp, " %08"PRIx64"\n", hc); + } else { + 
fprintf(o->fp, "\n"); + } + } +} + +/* ---------------------------------------------------------------------- + * Main checksumming algorithm + */ + +/* + * Canonicalised integer tags. + * We can store CcSsIi for unsigned and signed char, short and integer. + * (This can also happen for B arrays, but we don't yet canonicalise these.) + * + * Unfortunately some BAMs have degenerate encs, eg XAs\000\001 for XA:s:1. + * Also CRAM's computed NM can change, so NM:i:0 could be NMc0 or NMC0. + * + * Rules: unsigned if >= 0 + * smallest encoding necessary + * + * Returns a tag pointer (possibly local static, or original ptr), + * plus rewrites *tag_len if needed. + */ +uint8_t *canonical_tag(uint8_t *tag, size_t *tag_len) { + switch (tag[2]) { + static uint8_t ct[7], code; + int64_t val; + + case 'C': case 'c': + case 'S': case 's': + case 'I': case 'i': + val = bam_aux2i(tag+2); + if (val >= 0) { + if (val <= 255) code = 'C'; + else if (val <= 65535) code = 'S'; + else code = 'I'; + } else { + if (val >= -128 && val <= 127) code = 'c'; + else if (val >= -32768 && val <= 32767) code = 's'; + else code = 'i'; + } + if (code == tag[2]) + // Already optimal. The usual code path + return tag; + + // Otherwise rewrite it; + ct[0] = tag[0]; + ct[1] = tag[1]; + ct[2] = code; + switch (code) { + case 'C': case 'c': + ct[3] = val; + *tag_len = 4; + break; + + case 'S': case 's': + // Don't care about sign as it's defined anyway + u16_to_le(val, ct+3); + *tag_len = 5; + break; + + case 'I': case 'i': + // Don't care about sign as it's defined anyway + u32_to_le(val, ct+3); + *tag_len = 7; + break; + } + return ct; + + default: + return tag; + } +} + +// Qsort callback, by integer +static int tag_qsort(const void *t1, const void *t2) { + return *(const int *)t1 - *(const int *)t2; +} + +/* + * Produces a concatenated string of aux tags in binary + * representation, with the tag names and orders defined in tag_ids[], + * checksums it, and combines it with the flag-seq CRC. 
+ * If *tag_str is "*" then we negate tag_ids and encode everything but those. + * This is a bit trickier as we can no longer use the order specified and + * instead encode in ASCII sorted order instead. + * + * If the read-group is found in the RG:Z: aux, this is returned in + * the *RGZ ptr (which points to the field. + * + * Returns 0 on success, updating *crc_aux, + * -1 on error + */ +int hash_aux(bam1_t *b, kstring_t *ks, int ntags, + char **tag_ids, + uint8_t **tag_ptr, size_t *tag_len, + const char *tag_str, short (*tag_keep)[75], + uint32_t crc_seq, uint32_t *crc_aux, + uint8_t **RGZ) { + size_t aux_len = bam_get_l_aux(b); + // 1 byte minimum forces a non-NULL pointer so CRC works + if (ks_resize(ks, aux_len+1) < 0) + return -1; + uint8_t *aux_ptr = (uint8_t *)ks->s; + + // Pass 1: find all tags to copy and their lengths + uint8_t *aux = bam_aux_first(b), *aux_next; + memset(tag_len, 0, ntags * sizeof(*tag_len)); + int tag_id[4000]; // a-zA-Z0-9 is 62. 62^2 is 3844 + + if (*tag_str == '*') { + // All tags bar specific ones, in alphanumeric order. + // Select the tags by name on pass 1, then sort by name to get + // a canonical order, and finally concatenate tags in order. + ntags = 0; + while (aux) { + if (aux[-2] == 'R' && aux[-1] == 'G' && aux[0] == 'Z' && RGZ) + *RGZ = aux+1; + aux_next = bam_aux_next(b, aux); + if (!(aux[-2] >= '0' && aux[-2] <= 'z' && + aux[-1] >= '0' && aux[-1] <= 'z')) { + aux = aux_next; + continue; // skip illegal tag names + } + if (tag_keep[aux[-2]-'0'][aux[-1]-'0'] == 0) { + size_t tag_sz = aux_next + ? aux_next - aux + : b->data + b->l_data - aux + 2; + tag_id[ntags] = (aux[-2]<<24) | (aux[-1]<<16) | ntags; + tag_ptr[ntags] = aux-2; + tag_len[ntags] = tag_sz; + if (++ntags >= 4000) + return -1; + } + + aux = aux_next; + } + + // Sort + qsort(tag_id, ntags, sizeof(*tag_id), tag_qsort); + + // Now we have tag_ptr2 in order of occurrence and tag_id in + // lexicalgraphical order. 
Stitch together + for (int i = 0; i < ntags; i++) { + int orig_pos = tag_id[i]&0xffff; + size_t len = tag_len[orig_pos]; + uint8_t *tag = canonical_tag(tag_ptr[orig_pos], &len); + memcpy(aux_ptr, tag, len); + aux_ptr += len; + } + + } else { + // Selected tags only, in the order requested + while (aux) { + if (aux[-2] == 'R' && aux[-1] == 'G' && aux[0] == 'Z' && RGZ) + *RGZ = aux+1; + aux_next = bam_aux_next(b, aux); + if (!(aux[-2] >= '0' && aux[-2] <= 'z' && + aux[-1] >= '0' && aux[-1] <= 'z')) + continue; // skip illegal tag names + int i = tag_keep[aux[-2]-'0'][aux[-1]-'0']-1; + if (i>=0) { + // found one + size_t tag_sz = aux_next + ? aux_next - aux + : b->data + b->l_data - aux + 2; + + tag_ptr[i] = aux-2; + tag_len[i] = tag_sz; + } + + aux = aux_next; + } + + // Pass 2: copy tags in the order we requested + for (int i = 0; i < ntags; i++) { + if (tag_len[i]) { + size_t len = tag_len[i]; + uint8_t *tag = canonical_tag(tag_ptr[i], &len); + memcpy(aux_ptr, tag, len); + aux_ptr += len; + } + } + } + + //write(3, (uint8_t *)ks->s, aux_ptr - (uint8_t *)ks->s); + *crc_aux = hts_crc32(crc_seq, ks->s, aux_ptr - (uint8_t *)ks->s); + + return 0; +} + +// Qsort callback, by kh_key(h,idx). +// Needs a global due to the rubbish interface of qsort, but that's fine +// as we're not multi-threaded. +static khash_t(chk) *key_qsort_h = NULL; +static int key_qsort(const void *t1, const void *t2) { + return strcmp(kh_key(key_qsort_h, *(const khiter_t *)t1), + kh_key(key_qsort_h, *(const khiter_t *)t2)); +} + +// Compatibility with biobambam2's bamseqchksum output format +int checksum_bamseqchksum(opts *o, sums_t *all, sums_t *noRG, khash_t(chk) *h){ + // Why two tabs after count? 
+ fprintf(o->fp, "###\tset\tcount\t\tb_seq\tname_b_seq\tb_seq_qual\tb_seq_tags(BC,FI,QT,RT,TC)\n"); + + o->tabs = 1; + o->show_pass = 1; + o->verbose = 1; + o->show_combine = 0; + sums_report(o, all, "all"); + sums_report(o, noRG, ""); + + // Per read-group line + int nrgs = 0; + khiter_t *rgs = malloc(kh_size(h) * sizeof(*rgs)); + if (!rgs) + return -1; + + for (khiter_t k = kh_begin(h); k != kh_end(h); k++) + if (kh_exist(h, k)) + rgs[nrgs++] = k; + + key_qsort_h = h; // Use a global to avoid extra hash lookups here + qsort(rgs, nrgs, sizeof(*rgs), key_qsort); + for (int k = 0; k < nrgs; k++) + sums_report(o, &kh_value(h, rgs[k]), kh_key(h, rgs[k])); + + free(rgs); + + return 0; +} + +int checksum_report(char *fn, opts *o, + sums_t *all, sums_t *noRG, khash_t(chk) *h) { + if (o->compat) + return checksum_bamseqchksum(o, all, noRG, h); + + // headers + fprintf(o->fp, "# Checksum 1.0 for file:%s%s\n", + o->tabs ? "\t" : " ", fn); + fprintf(o->fp, "# Aux tags:%s%s\n", + o->tabs ? "\t" : " ", o->tag_str); + char *s=bam_flag2str(o->flag_mask); + if (!s) + return -1; + fprintf(o->fp, "# BAM flags:%s%s\n", + o->tabs ? "\t" : " ", s); + free(s); + if (o->tabs) + fprintf(o->fp, "\n# Group\tQC\tcount\tflag+seq\t+name\t+qual\t+aux"); + else + fprintf(o->fp, "\n# Group QC count flag+seq +name" + " +qual +aux "); + if (o->check_pos) + fprintf(o->fp, o->tabs ? "\t+chr/pos" : " +chr/pos"); + if (o->check_cigar) + fprintf(o->fp, o->tabs ? "\t+cigar" : " +cigar "); + if (o->check_mate) + fprintf(o->fp, o->tabs ? "\t+mate" : " +mate "); + fprintf(o->fp, o->tabs ? 
"\tcombined\n" : " combined\n"); + + // All and "-" (no RG) lines + sums_report(o, all, "all"); + if (o->verbose || (noRG->count[0] + noRG->count[1])) + sums_report(o, noRG, "-"); + + // Per read-group line + int nrgs = 0; + khiter_t *rgs = malloc(kh_size(h) * sizeof(*rgs)); + if (!rgs) + return -1; + + for (khiter_t k = kh_begin(h); k != kh_end(h); k++) + if (kh_exist(h, k)) + rgs[nrgs++] = k; + + key_qsort_h = h; // Use a global to avoid extra hash lookups here + qsort(rgs, nrgs, sizeof(*rgs), key_qsort); + for (int k = 0; k < nrgs; k++) + sums_report(o, &kh_value(h, rgs[k]), kh_key(h, rgs[k])); + + free(rgs); + + return 0; +} + +int checksum(sam_global_args *ga, opts *o, char *fn) { + samFile *fp = NULL; + sam_hdr_t *hdr = NULL; + bam1_t *b = bam_init1(); + char **tags = o->tags; + int ntags = o->ntags; + uint8_t **tag_ptr = calloc(65536, sizeof(*tag_ptr)); + size_t *tag_len = calloc(65536, sizeof(*tag_len)); + kstring_t aux_ks = KS_INITIALIZE; + kstring_t seq_ks = KS_INITIALIZE; + kstring_t qual_ks = KS_INITIALIZE; + khash_t(chk) *h = kh_init(chk); + int ret = -1; + int64_t nrec = o->nrec; + + if (!b || !tag_ptr || !tag_len || !h) + goto err; + +//#undef HTS_LITTLE_ENDIAN // uncomment this to validate / debug + +#ifndef HTS_LITTLE_ENDIAN + kstring_t cigar_ks = KS_INITIALIZE; +#endif + + // A precomputed lookup table to speed up selection of tags + short tag_keep[75][75] = {0}; // 'z' is 122, '0' is 48. 
122-48+1 == 75 + for (int i = 0; i < ntags; i++) { + char *t = tags[i]; + if (t[0] != '*' && + !(t[0] >= '0' && t[0] <= 'z' && + t[1] >= '0' && t[1] <= 'z')) { + fprintf(stderr, "[checksum] Illegal tag ID '%.2s'\n", t); + goto err; + } + if (t[0] != '*') + tag_keep[t[0]-'0'][t[1]-'0'] = i+1; + } + + sums_t h32, noRG; + sums_init(&h32); + sums_init(&noRG); + uint32_t crc32_start = hts_crc32(0, NULL, 0); + + fp = sam_open_format(fn, "r", &ga->in); + if (!fp) { + print_error_errno("checksum", "Cannot open input file \"%s\"", fn); + goto err; + } + + if (ga->nthreads > 0) + hts_set_threads(fp, ga->nthreads); + + if (!(hdr = sam_hdr_read(fp))) + goto err; + + int r; + while ((r = sam_read1(fp, hdr, b)) >= 0) { + crcs_t c; + + if (b->core.flag & o->excl_flags) + continue; + + if ((b->core.flag & o->req_flags) != o->req_flags) + continue; + + if (o->sanitize) + bam_sanitize(hdr, b, o->sanitize); + + // 8 bits of flag corresponding to original instrument data + uint8_t flags = b->core.flag & o->flag_mask; + + // Copy sequence out from nibble to base, and reverse complement + // seq / qual if required. Qual is +33 (ASCII format) only for + // compatibility with biobambam's bamseqchksum tool. + // The +1 here and elsewhere is to force zero byte allocations to + // always return a pointer rather than NULL. This in turn prevents + // crc32() from considering it as a reinitialisation. + if (ks_resize(&seq_ks, b->core.l_qseq+1) < 0 || + ks_resize(&qual_ks, b->core.l_qseq+1) < 0) + goto err; + + fill_seq_qual(o, b, (uint8_t *)seq_ks.s, (uint8_t *)qual_ks.s); + + // flag + seq + uint32_t crc = hts_crc32(crc32_start, &flags, 1); + c.seq = hts_crc32(crc, seq_ks.s, b->core.l_qseq); + + // name + flag + seq. + // flag + seq + name would be faster, but bamseqchksum does this. + // Also include single nul for compatibility too. 
+ crc = hts_crc32(crc32_start, bam_get_qname(b), + b->core.l_qname - b->core.l_extranul); + crc = hts_crc32(crc, &flags, 1); + c.name = hts_crc32(crc, seq_ks.s, b->core.l_qseq); + + // flag + seq + qual + c.qual = hts_crc32(c.seq, qual_ks.s, b->core.l_qseq); + + // flag + seq + aux tags + uint8_t *RGZ = NULL; + if (hash_aux(b, &aux_ks, ntags, tags, tag_ptr, tag_len, + o->tag_str, tag_keep, c.seq, &c.aux, &RGZ) < 0) + goto err; + + // flag + seq + chr + pos + if (o->check_pos) { + uint8_t chr_pos[4+8]; + u32_to_le(b->core.tid, chr_pos); + u64_to_le(b->core.pos, chr_pos+4); + c.pos = hts_crc32(c.seq, chr_pos, 12); + } + + // flag + seq + rnext + pnext + tlen + if (o->check_mate) { + uint8_t mate[4+8+8]; + u32_to_le(b->core.mtid, mate); + u64_to_le(b->core.mpos, mate+4); + u64_to_le(b->core.isize, mate+12); + c.mate = hts_crc32(c.seq, mate, 12); + } + + // flag + seq + mapq + cigar + if (o->check_cigar) { + uint8_t *cigar = (uint8_t *)bam_get_cigar(b); +#ifndef HTS_LITTLE_ENDIAN + if (ks_resize(&cigar_ks, 4 * b->core.n_cigar+1) < 0) + goto err; + uint32_t *cig32 = bam_get_cigar(b); + cigar = (uint8_t *)cigar_ks.s; + + for (int i = 0; i < b->core.n_cigar; i++) + u32_to_le(cig32[i], cigar + 4*i); +#endif + uint8_t mapq[4]; + u32_to_le(b->core.qual, mapq); + c.cigar = hts_crc32(c.seq, mapq, 4); + c.cigar = hts_crc32(c.cigar, cigar, 4 * b->core.n_cigar); + } + + // Aggregate checksum hashes + uint64_t count = h32.count[0]; + if (RGZ) { + sums_t *h32p; + + // create func + int kret; + khiter_t k = kh_get(chk, h, (char *)RGZ); + if (k == kh_end(h)) { + char *rgz_ = strdup((char *)RGZ); + if (!rgz_) + goto err; + k = kh_put(chk, h, rgz_, &kret); + if (kret < 0) { + free(rgz_); + goto err; + } + sums_init(&kh_value(h, k)); + } + h32p = &kh_value(h, k); + + count = h32p->count[0]; + sums_update(b->core.flag & BAM_FQCFAIL, h32p, &c, o, count); + } else { + count = noRG.count[0]; + sums_update(b->core.flag & BAM_FQCFAIL, &noRG, &c, o, count); + } + + sums_update(b->core.flag & 
BAM_FQCFAIL, &h32, &c, o, count); + + if (nrec && --nrec == 0) + break; + } + + if (r < -1) + goto err; + + if (sam_close(fp) < 0) { + fp = NULL; + print_error_errno("checksum", "Closing input file \"%s\"", fn); + goto err; + } + fp = NULL; + + // Report hashes + if (checksum_report(fn, o, &h32, &noRG, h) < 0) + goto err; + + ret = 0; + err: + if (b) bam_destroy1(b); + if (hdr) sam_hdr_destroy(hdr); + if (fp) sam_close(fp); + + free(tag_ptr); + free(tag_len); + ks_free(&aux_ks); + ks_free(&seq_ks); + ks_free(&qual_ks); +#ifndef HTS_LITTLE_ENDIAN + ks_free(&cigar_ks); +#endif + + if (h) { + for (khiter_t k = kh_begin(h); k != kh_end(h); k++) { + if (!kh_exist(h, k)) + continue; + + free((char *)kh_key(h, k)); + } + kh_destroy(chk, h); + } + + return ret; +} + +/* ---------------------------------------------------------------------- + * Checksum combining. This is used to merge multiple checksum output files + * from e.g. "samtools split" readgroup files, into a single combined + * checksum to give the same result as doing a samtools merge | checksum. 
+ */ + +// Process an individual file, aggregating to s, noRG and h +static int sums_parse(opts *o, char *fn, sums_t *sums, sums_t *noRG, + khash_t(chk) *h) { + int ret = -1; + FILE *fp; + if ((fp = fopen(fn, "r")) == NULL) { + perror(fn); + return -1; + } + + kstring_t line = KS_INITIALIZE; + int nheader = 0; + enum { + H_GROUP, H_QC, H_COUNT, H_SEQ, H_NAME, H_QUAL, H_AUX, + H_POS, H_CIGAR, H_MATE, H_COMBINED + } header[11] = {-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, -1}; + crcs_t crcs = {1,1,1,1,1,1,1}; + + while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) { + if (strncmp(line.s, "# Checksum", 10) == 0) { + int major, minor; + if (sscanf(line.s, "# Checksum %d.%d", &major, &minor) == 2) { + if (major != 1 || minor != 0) { + fprintf(stderr, "Unsupported checksum output version\n"); + goto err; + } + } + continue; + } + + if (strncmp(line.s, "# Group", 7) == 0) { + // Parse column header so we know which fields are present + int n, i = 0, idx; + char *ptr = line.s+2; + char token[20]; + while ((n = sscanf(ptr, "%19s%n", token, &idx)) == 1) { + if (strcmp(token, "Group") == 0) + header[i] = H_GROUP; + else if (strcmp(token, "QC") == 0) + header[i] = H_QC; + else if (strcmp(token, "count") == 0) + header[i] = H_COUNT; + else if (strcmp(token, "flag+seq") == 0) + header[i] = H_SEQ; + else if (strcmp(token, "+name") == 0) + header[i] = H_NAME; + else if (strcmp(token, "+qual") == 0) + header[i] = H_QUAL; + else if (strcmp(token, "+aux") == 0) + header[i] = H_AUX; + else if (strcmp(token, "+chr/pos") == 0) + header[i] = H_POS, o->check_pos = 1; + else if (strcmp(token, "+cigar") == 0) + header[i] = H_CIGAR, o->check_cigar = 1; + else if (strcmp(token, "+mate") == 0) + header[i] = H_MATE, o->check_mate = 1; + else if (strcmp(token, "combined") == 0) + header[i] = H_COMBINED; + else { + fprintf(stderr, "Unrecognised header token '%s'\n", token); + goto err; + } + + i++; + ptr += idx; + } + nheader = i; + + continue; + } + + if (strncmp(line.s, "# Aux", 5) == 0) { + 
int idx; + char c; + if (sscanf(line.s, "# Aux tags: %c%n", &c, &idx) == 1) + if (!o->tag_str) + o->tag_free = o->tag_str = strdup(line.s + idx-1); + + continue; + } + + if (strncmp(line.s, "# BAM", 5) == 0) { + int idx; + char c; + if (sscanf(line.s, "# BAM flags: %c%n", &c, &idx) == 1) + o->flag_mask = bam_str2flag(line.s + idx-1); + + continue; + } + + if (!line.l || *line.s == '#') + continue; + + + // Header done. Now parse the data lines + if (strncmp(line.s, "all ", 4) == 0 || + strncmp(line.s, "all\t", 4) == 0) + continue; + + char col[11][128], *ptr = line.s; + int nf; + for (nf = 0; nf < 11; nf++) { + int idx; + int n = sscanf(ptr, "%127s%n", col[nf], &idx); + if (n <= 0) + break; + if (strlen(col[nf]) == 127) { + fprintf(stderr, "Field too long\n"); + goto err; + } + ptr += idx; + } + + // Sanity check that header and rows match + if (nf < 8 || nf != nheader) { + fprintf(stderr, "Incorrect number of columns in line: %s\n", + line.s); + goto err; + } + + // Marry up column header with row entries and set struct. + // (We could update the struct to be numbered instead of + // named in variables to make this easier.) 
+ int qc = 0; + uint64_t count = 0; + for (int i = 0; i < nf; i++) { + switch (header[i]) { + case H_QC: + if (strcmp(col[i], "all") == 0) + qc = 0; + else if (strcmp(col[i], "pass") == 0) + qc = 1; + else if (strcmp(col[i], "fail") == 0) + qc = 2; + else + goto err; + break; + + case H_COUNT: + count = strtoull(col[i], NULL, 10); + break; + + case H_SEQ: + crcs.seq = strtoul(col[i], NULL, 16); + break; + + case H_NAME: + crcs.name = strtoul(col[i], NULL, 16); + break; + + case H_QUAL: + crcs.qual = strtoul(col[i], NULL, 16); + break; + + case H_AUX: + crcs.aux = strtoul(col[i], NULL, 16); + break; + + case H_POS: + crcs.pos = strtoul(col[i], NULL, 16); + break; + + case H_CIGAR: + crcs.cigar = strtoul(col[i], NULL, 16); + break; + + case H_MATE: + crcs.mate = strtoul(col[i], NULL, 16); + break; + + default: + break; + } + } + + // Add group entry + if (strcmp(col[0], "-") == 0) { + sums_update_row(qc, noRG, &crcs, 0, count); + } else { + int kret; + khiter_t k = kh_get(chk, h, col[0]); + if (k == kh_end(h)) { + char *rgz_ = strdup(col[0]); + if (!rgz_) + goto err; + k = kh_put(chk, h, rgz_, &kret); + if (kret < 0) { + free(rgz_); + goto err; + } + sums_init(&kh_value(h, k)); + } + sums_update_row(qc, &kh_value(h, k), &crcs, 0, count); + } + + // Add to global "all" stats + sums_update_row(qc, sums, &crcs, 0, count); + } + + ret = 0; + + err: + ks_free(&line); + fclose(fp); + return ret; +} + +// Combine multiple checksum files together and report the merged stats +int combine(opts *o, int argc, char **argv) { + int ret = -1; + sums_t s, noRG; + sums_init(&s); + sums_init(&noRG); + + free(o->tag_free); // Probably NULL, but just incase + o->tag_free = o->tag_str = NULL; + khash_t(chk) *h = kh_init(chk); + if (!h) + goto err; + for (int i = 0; i < argc; i++) { + if (sums_parse(o, argv[i], &s, &noRG, h) < 0) { + fprintf(stderr, "Failed to parse checksum file '%s'\n", argv[i]); + goto err; + } + } + checksum_report("merge", o, &s, &noRG, h); + + ret = 0; + err: + 
free(o->tag_free); + o->tag_free = NULL; + + if (h) { + for (khiter_t k = kh_begin(h); k != kh_end(h); k++) { + if (!kh_exist(h, k)) + continue; + + free((char *)kh_key(h, k)); + } + kh_destroy(chk, h); + } + + return ret; +} + +/* ---------------------------------------------------------------------- + * CLI + */ +void usage_exit(FILE *fp, int ret) { + fprintf(stderr, "Usage: samtools checksum [options] [file.bam ...]\n"); + fprintf(stderr, "or samtools checksum [options] -m [file.chk ...]\n\n"); + fprintf(stderr, "Options:\n\ + -F, --exclude-flags FLAG Filter if any FLAGs are present [0x900]\n\ + -f, --require-flags FLAG Filter unless all FLAGs are present [0]\n\ + -b, --flag-mask FLAG BAM FLAGs to use in checksums [0x0c1]\n\ + -c, --no-rev-comp Do not reverse-complement sequences [off]\n\ + -t, --tags STR[,STR] Select tags to checksum [BC,FI,QT,RT,TC]\n\ + -O, --in-order Use order-specific checksumming [off]\n\ + -P, --check-pos Also checksum CHR / POS [off]\n\ + -C, --check-cigar Also checksum MAPQ / CIGAR [off]\n\ + -M, --check_mate Also checksum PNEXT / RNEXT / TLEN [off]\n\ + -z, --sanitize FLAGS Perform sanity checks and fix records [off]\n\ + -N, --count INT Stop after INT number of records [0]\n\ + -o, --output FILE Write report to FILE [stdout]\n\ + -q, --show-qc Also show QC pass/fail lines\n\ + -v, --verbose Increase verbosity: show lines with 0 counts\n\ + -a, --all Check all: -PCMOc -b 0xfff -f0 -F0 -z all,cigarx\n\ + -T, --tabs Format output as tab delimited text\n\ + -m, --merge FILE Merge checksum output (-o opt) files\n\ + -B, --bamseqchksum Report in bamseqchksum format\n"); + fprintf(fp, "\nGlobal options:\n"); + sam_global_opt_help(fp, "-.---@--"); + exit(ret); +} + +int parse_tags(opts *o) { + // Count + int nt = 0; + for (char *t = o->tag_str; *t; t++) { + nt++; + char *l = t; + while (*t && *t != ',') + t++; + if (t-l != 2 && !(t-l == 1 && *l == '*')) { + fprintf(stderr, "Bad tag string. Should be XX,YY,... 
syntax\n"); + return 1; + } + if (!*t) + break; + } + + // Split by tag + o->ntags = nt; + o->tags = calloc(nt, sizeof(*o->tags)); + if (!o->tags) + return 1; + + nt = 0; + for (char *t = o->tag_str; *t; t++, nt++) { + o->tags[nt] = t; + while (*t && *t != ',') + t++; + if (!*t) + break; + } + + return 0; +} + +// Main command entry +int main_checksum(int argc, char **argv) { + opts opts = { + .req_flags = 0, + .excl_flags = BAM_FSECONDARY | BAM_FSUPPLEMENTARY, + .flag_mask = BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2, + .rev_comp = 1, + .tag_str = "BC,FI,QT,RT,TC", + .tag_free = NULL, + .check_pos = 0, + .check_cigar = 0, + .check_mate = 0, + .in_order = 0, + .sanitize = 0, + .nrec = 0, + .verbose = 0, + .show_pass = 0, + .show_fail = 0, + .show_combine = 1, + .fp = stdout, + .tabs = 0, + .merge = 0, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 'I', '-', '-', '.', '@'), + {"exclude-flags", required_argument, NULL, 'F'}, + {"require-flags", required_argument, NULL, 'f'}, + {"flag-mask", required_argument, NULL, 'b'}, + {"tags", required_argument, NULL, 't'}, + {"no-rev-comp", no_argument, NULL, 'c'}, + {"in-order", no_argument, NULL, 'O'}, + {"check-pos", no_argument, NULL, 'P'}, + {"check-cigar", no_argument, NULL, 'C'}, + {"check-mate", no_argument, NULL, 'M'}, + {"count", required_argument, NULL, 'N'}, + {"sanitize", required_argument, NULL, 'z'}, + {"output", required_argument, NULL, 'o'}, + {"show-qc", no_argument, NULL, 'q'}, + {"verbose", no_argument, NULL, 'v'}, + {"all", no_argument, NULL, 'a'}, + {"tabs", no_argument, NULL, 'T'}, + {"merge", no_argument, NULL, 'm'}, + {"bamseqchksum", no_argument, NULL, 'B'}, + {NULL, 0, NULL, 0} + }; + + if (argc == 1 && isatty(STDIN_FILENO)) + usage_exit(stdout, EXIT_SUCCESS); + + int c; + while ((c = getopt_long(argc, argv, "@:f:F:t:cPCMOb:z:aN:vqo:TmB", + lopts, NULL)) >= 0) { + switch (c) { + case 'O': + opts.in_order++; + break; + case 'F': + 
if ((opts.excl_flags = bam_str2flag(optarg)) < 0) { + print_error("checksum", "could not parse flag %s", optarg); + return 1; + } + break; + case 'f': + if ((opts.req_flags = bam_str2flag(optarg)) < 0) { + print_error("checksum", "could not parse flag %s", optarg); + return 1; + } + break; + case 'b': + if ((opts.flag_mask = bam_str2flag(optarg)) < 0) { + print_error("checksum", "could not parse flag %s", optarg); + return 1; + } + break; + case 'P': + opts.check_pos = 1; + break; + case 'C': + opts.check_cigar = 1; + break; + case 'M': + opts.check_mate = 1; + break; + case 't': + opts.tag_str = optarg; + break; + case 'c': + opts.rev_comp = 0; + break; + case 'N': + opts.nrec = strtoll(optarg, NULL, 0); + break; + + case 'B': + opts.compat = 1; + opts.show_pass = 1; + break; + case 'v': + opts.verbose++; + break; + case 'q': + opts.show_pass = opts.show_fail = 1; + break; + case 'T': + opts.tabs = 1; + break; + case 'm': + opts.merge = 1; + break; + + case 'z': + if ((opts.sanitize = bam_sanitize_options(optarg)) < 0) + return 1; + break; + + case 'a': + // ALL: a shorthand for a bunch of options to checksum the entire + // file contents. TODO: we still need tag wildcards. 
+ opts.req_flags = 0; + opts.excl_flags = 0; + opts.flag_mask = -1; + opts.rev_comp = 0; + opts.in_order = 1; + opts.check_pos = 1; + opts.check_cigar = 1; + opts.check_mate = 1; + opts.sanitize = FIX_ALL | FIX_CIGARX; + opts.tag_str = "*,cF,MD,NM"; + break; + + case 'o': + opts.fp = fopen(optarg, "w"); + if (!opts.fp) { + perror(optarg); + return 1; + } + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) + break; + /* else fall-through */ + case '?': + usage_exit(stderr, EXIT_FAILURE); + } + } + + if (!opts.tags) { + if (parse_tags(&opts) < 0) + return 1; + } + + int ret = 0; + if (opts.merge) { + ret = combine(&opts, argc - optind, argv+optind); + } else { + if (argc-optind) { + while (optind < argc) + ret |= checksum(&ga, &opts, argv[optind++]) < 0; + } else { + ret = checksum(&ga, &opts, "-") < 0; + } + } + + if (opts.fp != stdout) + ret |= fclose(opts.fp) < 0; + + free(opts.tags); + free(opts.tag_free); + + if (ret) + fprintf(stderr, "[checksum] Failed to process data\n"); + + return ret; +} diff --git a/samtools/bam_checksum.c.pysam.c b/samtools/bam_checksum.c.pysam.c new file mode 100644 index 00000000..cfebbc65 --- /dev/null +++ b/samtools/bam_checksum.c.pysam.c @@ -0,0 +1,1324 @@ +#include "samtools.pysam.h" + +/* bam_checksum.c -- produces checksums on SAM/BAM/CRAM/FASTA/FASTQ data + + Copyright (C) 2024 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * This is inspired by Biobambam's bamseqchksum tool written by + * David Jackson and amended by German Tischler. + * + * It computes order agnostic checksums for a variety of SAM fields, allowing + * validation that all the data is still present at different stages of an + * analysis pipeline. This may be useful to detect sequences which have been + * lost by an aligner, memory corruptions flipping individual sequence bases, + * or file format decoding errors. + * + * We start with something basic such as a FASTQ file, and name, seq and qual + * checksums should still all match after aligning and sorting. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "sam_opts.h" +#include "sam_utils.h" +#include "samtools.h" + +typedef struct { + int req_flags, excl_flags; // BAM flags filtering + int flag_mask, rev_comp, in_order, sanitize; + int check_pos, check_cigar, check_mate; + char *tag_str; // X,Y,Z or "*,X,Y,Z" for negation + char *tag_free;// copy of tag_str if non-literal + char **tags; // parsed and split tag_str + int ntags; + int64_t nrec; + int verbose; // whether to show zero count lines + int show_pass; // show pass stats + int show_fail; // show fail stats + int show_combine; // show the combine column + FILE *fp; + int tabs; + int merge; // merge checksum output, rather than read BAM et al. 
+ int compat; // compatibility with bamseqchksum format +} opts; + +/* ---------------------------------------------------------------------- + * Utility functions. Possible candidates for moving to htslib? + */ + +// Note: qual+33 is a pain, but only for the benefit of compatability with +// biobambam's bamseqchksum. It's also wrong for QUAL "*" as it triggers a +// wraparound and turning from BAM's 0xff-run to ASCII makes no sense in a +// checksum. + +#if 1 +// Nibble at a time. This could be sped up further. Eg see htslib's simd.c. +// That code ought to be expanded upon and exposed from htslib. +// +// However this is still 2.4x quicker than the naive implementation below +// It's now around 8% of CPU for a NovaSeq BAM, so some optimisation is +// possible but we're at deminishing returns. +void fill_seq_qual(opts *o, bam1_t *b, uint8_t *restrict seq_buf, + uint8_t *restrict qual_buf) { + // Tables mapping a pair of nibbles to a pair of ASCII bytes + static const char code2fwdbase[512] = + "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" + "A=AAACAMAGARASAVATAWAYAHAKADABAN" + "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN" + "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN" + "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN" + "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN" + "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN" + "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN" + "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN" + "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN" + "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN" + "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN" + "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN" + "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN" + "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN" + "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN"; + + static const char code2revbase[512] = + "==T=G=K=C=Y=S=B=A=W=R=D=M=H=V=N=" + "=TTTGTKTCTYTSTBTATWTRTDTMTHTVTNT" + "=GTGGGKGCGYGSGBGAGWGRGDGMGHGVGNG" + "=KTKGKKKCKYKSKBKAKWKRKDKMKHKVKNK" + "=CTCGCKCCCYCSCBCACWCRCDCMCHCVCNC" + "=YTYGYKYCYYYSYBYAYWYRYDYMYHYVYNY" + "=STSGSKSCSYSSSBSASWSRSDSMSHSVSNS" + "=BTBGBKBCBYBSBBBABWBRBDBMBHBVBNB" + "=ATAGAKACAYASABAAAWARADAMAHAVANA" + 
"=WTWGWKWCWYWSWBWAWWWRWDWMWHWVWNW" + "=RTRGRKRCRYRSRBRARWRRRDRMRHRVRNR" + "=DTDGDKDCDYDSDBDADWDRDDDMDHDVDND" + "=MTMGMKMCMYMSMBMAMWMRMDMMMHMVMNM" + "=HTHGHKHCHYHSHBHAHWHRHDHMHHHVHNH" + "=VTVGVKVCVYVSVBVAVWVRVDVMVHVVVNV" + "=NTNGNKNCNYNSNBNANWNRNDNMNHNVNNN"; + + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + + if ((b->core.flag & BAM_FREVERSE) && o->rev_comp) { + int i, j, len2 = b->core.l_qseq & ~1; + for (i=0, j=b->core.l_qseq-1; i < len2; i+=2, j-=2) { + memcpy(&seq_buf[j-1], &code2revbase[(size_t)seq[i>>1]*2], 2); + qual_buf[j-0] = qual[i+0]+33; + qual_buf[j-1] = qual[i+1]+33; + } + if (i < b->core.l_qseq) { + seq_buf[j] = "=TGKCYSBAWRDMHVN"[bam_seqi(seq, i)]; + qual_buf[j] = qual[i]+33; + } + } else { + int i, j, len2 = b->core.l_qseq & ~1; + for (i = j = 0; i < len2; i+=2, j++) { + // Note size_t cast helps gcc optimiser. + memcpy(&seq_buf[i], &code2fwdbase[(size_t)seq[j]*2], 2); + // Simple, but a union approach is a little faster with clang. + qual_buf[i+0] = qual[i+0]+33; + qual_buf[i+1] = qual[i+1]+33; + } + if (i < b->core.l_qseq) { + seq_buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + qual_buf[i] = qual[i]+33; + } + } +} + +#else +// Simple version +void fill_seq_qual(opts *o, bam1_t *b, uint8_t *restrict seq_buf, + uint8_t *restrict qual_buf) { + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + + if ((b->core.flag & BAM_FREVERSE) && o->rev_comp) { + for (int i=0, j=b->core.l_qseq-1; i < b->core.l_qseq; i++,j--) { + seq_buf[j] = "=TGKCYSBAWRDMHVN"[bam_seqi(seq, i)]; + qual_buf[j] = qual[i]+33; + } + } else { + for (int i = 0; i < b->core.l_qseq; i++) { + seq_buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + qual_buf[i] = qual[i]+33; + } + } +} +#endif + + +/* ---------------------------------------------------------------------- + * Checksum aggregation + */ + +/* + * The hash is multiplicative within a finite field, modulo PRIME. 
+ * We need to avoid zeros, and the data type has to be large enough to ensure + * no wraparound happens (other than the intended modulo). + * + * A simpler version would be (hash + crc) % PRIME, but we use the + * multiplicative version to keep compatibility with biobambam2. + */ +#define PRIME ((1u<<31)-1) +uint64_t update_hash(uint64_t hash, uint32_t crc) { + crc &= PRIME; + if (crc == 0 || crc == PRIME) + crc = 1; + + return (hash * crc) % PRIME; +} + +typedef struct { + uint64_t seq[3]; // flag + seq + uint64_t name[3]; // name + flag + seq + uint64_t qual[3]; // flag + seq + qual + uint64_t aux[3]; // flag + seq + aux + uint64_t pos[3]; // flag + seq + chr/pos + uint64_t cigar[3]; // flag + seq + cigar + uint64_t mate[3]; // flag + seq + rnext/pnext/tlen + uint64_t count[3]; +} sums_t; + +typedef struct { + uint32_t seq; + uint32_t name; + uint32_t qual; + uint32_t aux; + uint32_t pos; + uint32_t cigar; + uint32_t mate; +} crcs_t; + +KHASH_MAP_INIT_STR(chk, sums_t) + +// Initialise the sums. To 1 as we're multiplying and 0 is banned. 
+// (Except count which is literally just a counter)
+void sums_init(sums_t *h32) {
+    for (int i = 0; i < 3; i++) {
+        h32->seq[i]   = 1;
+        h32->name[i]  = 1;
+        h32->qual[i]  = 1;
+        h32->aux[i]   = 1;
+        h32->pos[i]   = 1;
+        h32->cigar[i] = 1;
+        h32->mate[i]  = 1;
+        h32->count[i] = 0;
+    }
+}
+
+// Updates a single row in the checksum output.
+//
+// Each CRC in *c is XORed with count_crc before being multiplied into the
+// matching column of h32 for this row, and the row's record count is
+// incremented by n.
+void sums_update_row(int row, sums_t *h32, const crcs_t *c,
+                     uint32_t count_crc, uint64_t n) {
+    h32->seq[row]  = update_hash(h32->seq[row],  count_crc ^ c->seq);
+    h32->name[row] = update_hash(h32->name[row], count_crc ^ c->name);
+    h32->qual[row] = update_hash(h32->qual[row], count_crc ^ c->qual);
+    h32->aux[row]  = update_hash(h32->aux[row],  count_crc ^ c->aux);
+    h32->pos[row]  = update_hash(h32->pos[row],  count_crc ^ c->pos);
+    h32->cigar[row]= update_hash(h32->cigar[row],count_crc ^ c->cigar);
+    h32->mate[row] = update_hash(h32->mate[row], count_crc ^ c->mate);
+    h32->count[row] += n;
+}
+
+// Updates a single group, with all/pass or all/fail rows.  Also handles the
+// in_order modes.
+//
+// Row 0 ("all") is always updated.  Row 1 ("pass") is updated when
+// o->show_pass is set and qcfail is zero; row 2 ("fail") when o->show_fail
+// is set and qcfail is non-zero.
+//
+// When o->in_order is non-zero, a CRC of a 64-bit record counter is mixed
+// into every column: "count" is used if o->in_order == 1, otherwise this
+// group's own h32->count[0].
+void sums_update(int qcfail, sums_t *h32, const crcs_t *crcs, opts *o,
+                 uint64_t count) {
+    uint32_t count_crc = 0;
+    if (o->in_order) {
+        uint8_t c[8];
+        u64_to_le(o->in_order == 1 ? count : h32->count[0], c);
+        count_crc = hts_crc32(0, c, 8);
+    }
+
+    sums_update_row(0, h32, crcs, count_crc, 1);
+    if (o->show_pass && !qcfail)
+        sums_update_row(1, h32, crcs, count_crc, 1);
+    if (o->show_fail && qcfail)
+        sums_update_row(2, h32, crcs, count_crc, 1);
+}
+
+// Report single group (all, pass, fail)
+//
+// Writes up to three lines to o->fp, one per row.  The pass and fail rows
+// are only written when o->show_pass / o->show_fail are set, and rows with
+// a zero count are skipped unless o->verbose is set.  "set" is the text for
+// the first (group) column.
+void sums_report(opts *o, sums_t *h32, const char *set) {
+    static const char *const pass[] = {"all", "pass", "fail"};
+
+    for (int r = 0; r <= 2; r++) {
+        uint64_t hc = 1;
+
+        if (r == 1 && !o->show_pass)
+            continue;
+        if (r == 2 && !o->show_fail)
+            continue;
+
+        if (!o->verbose && !h32->count[r])
+            continue;
+
+        if (o->tabs) {
+            fprintf(o->fp, "%s\t%s\t%"PRIu64"\t%s%"PRIx64"\t%"PRIx64
+                    "\t%"PRIx64"\t%"PRIx64, set, pass[r], h32->count[r],
+                    o->compat ? "\t" : "",
+                    h32->seq[r], h32->name[r], h32->qual[r], h32->aux[r]);
+            if (o->check_pos)
+                fprintf(o->fp, "\t%"PRIx64, h32->pos[r]);
+            if (o->check_cigar)
+                fprintf(o->fp, "\t%"PRIx64, h32->cigar[r]);
+            if (o->check_mate)
+                fprintf(o->fp, "\t%"PRIx64, h32->mate[r]);
+        } else {
+            fprintf(o->fp, "%-10s %-4s %12"PRIu64" %08"PRIx64" %08"PRIx64
+                    " %08"PRIx64" %08"PRIx64, set, pass[r], h32->count[r],
+                    h32->seq[r], h32->name[r], h32->qual[r], h32->aux[r]);
+            if (o->check_pos)
+                fprintf(o->fp, " %08"PRIx64, h32->pos[r]);
+            if (o->check_cigar)
+                fprintf(o->fp, " %08"PRIx64, h32->cigar[r]);
+            if (o->check_mate)
+                fprintf(o->fp, " %08"PRIx64, h32->mate[r]);
+        }
+
+        // Merge all.  The combined value folds in exactly the columns
+        // printed above: count (as two 32-bit halves), seq, name, qual, aux
+        // and then the optional pos / cigar / mate columns.
+        hc = update_hash(hc, h32->count[r]>>32);
+        hc = update_hash(hc, h32->count[r] & 0xffffffff);
+        hc = update_hash(hc, h32->seq[r]);
+        hc = update_hash(hc, h32->name[r]);
+        hc = update_hash(hc, h32->qual[r]);
+        hc = update_hash(hc, h32->aux[r]);
+        if (o->check_pos)
+            hc = update_hash(hc, h32->pos[r]);
+        if (o->check_cigar)
+            hc = update_hash(hc, h32->cigar[r]);
+        if (o->check_mate)
+            hc = update_hash(hc, h32->mate[r]);
+
+        if (o->show_combine) {
+            if (o->tabs)
+                fprintf(o->fp, "\t%"PRIx64"\n", hc);
+            else
+                fprintf(o->fp, " %08"PRIx64"\n", hc);
+        } else {
+            fprintf(o->fp, "\n");
+        }
+    }
+}
+
+/* ----------------------------------------------------------------------
+ * Main checksumming algorithm
+ */
+
+/*
+ * Canonicalised integer tags.
+ * We can store CcSsIi for unsigned and signed char, short and integer.
+ * (This can also happen for B arrays, but we don't yet canonicalise these.)
+ *
+ * Unfortunately some BAMs have degenerate encs, eg XAs\000\001 for XA:s:1.
+ * Also CRAM's computed NM can change, so NM:i:0 could be NMc0 or NMC0.
+ *
+ * Rules: unsigned if >= 0
+ *        smallest encoding necessary
+ *
+ * Returns a tag pointer (possibly local static, or original ptr),
+ * plus rewrites *tag_len if needed.
+ */
+uint8_t *canonical_tag(uint8_t *tag, size_t *tag_len) {
+    // Scratch space for a rewritten tag: 2 bytes of name, 1 of type and up
+    // to 4 of value.  Static, so the result is only valid until the next
+    // call.
+    static uint8_t ct[7];
+    const uint8_t type = tag[2];
+
+    // Anything that is not a single integer is passed through untouched
+    if (type != 'C' && type != 'c' &&
+        type != 'S' && type != 's' &&
+        type != 'I' && type != 'i')
+        return tag;
+
+    // Pick the smallest type code that can hold the value
+    const int64_t val = bam_aux2i(tag+2);
+    uint8_t code;
+    if (val >= 0)
+        code = val <= 255 ? 'C' : val <= 65535 ? 'S' : 'I';
+    else
+        code = val >= -128 ? 'c' : val >= -32768 ? 's' : 'i';
+
+    if (code == type)
+        // Already optimal.  The usual code path
+        return tag;
+
+    // Otherwise rewrite it, keeping the two character tag name
+    ct[0] = tag[0];
+    ct[1] = tag[1];
+    ct[2] = code;
+    if (code == 'C' || code == 'c') {
+        ct[3] = val;
+        *tag_len = 4;
+    } else if (code == 'S' || code == 's') {
+        // Don't care about sign as it's defined anyway
+        u16_to_le(val, ct+3);
+        *tag_len = 5;
+    } else {
+        // 'I' or 'i'.  Don't care about sign as it's defined anyway
+        u32_to_le(val, ct+3);
+        *tag_len = 7;
+    }
+
+    return ct;
+}
+
+// Qsort callback, ordering by the int each element points to
+static int tag_qsort(const void *t1, const void *t2) {
+    const int *lhs = t1;
+    const int *rhs = t2;
+
+    return *lhs - *rhs;
+}
+
+/*
+ * Produces a concatenated string of aux tags in binary
+ * representation, with the tag names and orders defined in tag_ids[],
+ * checksums it, and combines it with the flag-seq CRC.
+ * If *tag_str is "*" then we negate tag_ids and encode everything but those.
+ * This is a bit trickier as we can no longer use the order specified and
+ * instead encode in ASCII sorted order.
+ *
+ * If the read-group is found in the RG:Z: aux, this is returned in
+ * the *RGZ ptr (which points to the field value).
+ *
+ * Returns 0 on success, updating *crc_aux,
+ *        -1 on error (memory allocation failure, or 4000 or more tags
+ *           selected in "*" mode)
+ */
+int hash_aux(bam1_t *b, kstring_t *ks, int ntags,
+             char **tag_ids,
+             uint8_t **tag_ptr, size_t *tag_len,
+             const char *tag_str, short (*tag_keep)[75],
+             uint32_t crc_seq, uint32_t *crc_aux,
+             uint8_t **RGZ) {
+    size_t aux_len = bam_get_l_aux(b);
+    // 1 byte minimum forces a non-NULL pointer so CRC works
+    if (ks_resize(ks, aux_len+1) < 0)
+        return -1;
+    uint8_t *aux_ptr = (uint8_t *)ks->s;
+
+    // Pass 1: find all tags to copy and their lengths
+    uint8_t *aux = bam_aux_first(b), *aux_next;
+    memset(tag_len, 0, ntags * sizeof(*tag_len));
+    int tag_id[4000]; // a-zA-Z0-9 is 62.  62^2 is 3844
+
+    if (*tag_str == '*') {
+        // All tags bar specific ones, in alphanumeric order.
+        // Select the tags by name on pass 1, then sort by name to get
+        // a canonical order, and finally concatenate tags in order.
+        ntags = 0;
+        while (aux) {
+            if (aux[-2] == 'R' && aux[-1] == 'G' && aux[0] == 'Z' && RGZ)
+                *RGZ = aux+1;
+            aux_next = bam_aux_next(b, aux);
+            if (!(aux[-2] >= '0' && aux[-2] <= 'z' &&
+                  aux[-1] >= '0' && aux[-1] <= 'z')) {
+                aux = aux_next;
+                continue; // skip illegal tag names
+            }
+            if (tag_keep[aux[-2]-'0'][aux[-1]-'0'] == 0) {
+                // aux points at the type byte, so the 2 byte name is
+                // included either via aux_next - aux or the explicit +2.
+                size_t tag_sz = aux_next
+                    ? aux_next - aux
+                    : b->data + b->l_data - aux + 2;
+                // Sort key: tag name in the top bits, order of occurrence
+                // in the low 16 bits.
+                tag_id[ntags] = (aux[-2]<<24) | (aux[-1]<<16) | ntags;
+                tag_ptr[ntags] = aux-2;
+                tag_len[ntags] = tag_sz;
+                if (++ntags >= 4000)
+                    return -1;
+            }
+
+            aux = aux_next;
+        }
+
+        // Sort
+        qsort(tag_id, ntags, sizeof(*tag_id), tag_qsort);
+
+        // Now we have tag_ptr in order of occurrence and tag_id in
+        // lexicographical order.  Stitch together
+        for (int i = 0; i < ntags; i++) {
+            int orig_pos = tag_id[i]&0xffff;
+            size_t len = tag_len[orig_pos];
+            uint8_t *tag = canonical_tag(tag_ptr[orig_pos], &len);
+            memcpy(aux_ptr, tag, len);
+            aux_ptr += len;
+        }
+
+    } else {
+        // Selected tags only, in the order requested
+        while (aux) {
+            if (aux[-2] == 'R' && aux[-1] == 'G' && aux[0] == 'Z' && RGZ)
+                *RGZ = aux+1;
+            aux_next = bam_aux_next(b, aux);
+            if (!(aux[-2] >= '0' && aux[-2] <= 'z' &&
+                  aux[-1] >= '0' && aux[-1] <= 'z')) {
+                // Must advance before continuing, otherwise the same tag
+                // is examined forever.
+                aux = aux_next;
+                continue; // skip illegal tag names
+            }
+            int i = tag_keep[aux[-2]-'0'][aux[-1]-'0']-1;
+            if (i>=0) {
+                // found one
+                size_t tag_sz = aux_next
+                    ? aux_next - aux
+                    : b->data + b->l_data - aux + 2;
+
+                tag_ptr[i] = aux-2;
+                tag_len[i] = tag_sz;
+            }
+
+            aux = aux_next;
+        }
+
+        // Pass 2: copy tags in the order we requested
+        for (int i = 0; i < ntags; i++) {
+            if (tag_len[i]) {
+                size_t len = tag_len[i];
+                uint8_t *tag = canonical_tag(tag_ptr[i], &len);
+                memcpy(aux_ptr, tag, len);
+                aux_ptr += len;
+            }
+        }
+    }
+
+    //write(3, (uint8_t *)ks->s, aux_ptr - (uint8_t *)ks->s);
+    *crc_aux = hts_crc32(crc_seq, ks->s, aux_ptr - (uint8_t *)ks->s);
+
+    return 0;
+}
+
+// Qsort callback, by kh_key(h,idx).
+// Needs a global due to the rubbish interface of qsort, but that's fine
+// as we're not multi-threaded.
+static khash_t(chk) *key_qsort_h = NULL;
+static int key_qsort(const void *t1, const void *t2) {
+    return strcmp(kh_key(key_qsort_h, *(const khiter_t *)t1),
+                  kh_key(key_qsort_h, *(const khiter_t *)t2));
+}
+
+// Reports every group held in hash table h, sorted by key.
+// Shared by checksum_bamseqchksum() and checksum_report().
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure
+static int sums_report_groups(opts *o, khash_t(chk) *h) {
+    // Nothing to report.  Returning here also avoids mistaking a NULL
+    // result from a zero-sized malloc for an allocation failure.
+    if (kh_size(h) == 0)
+        return 0;
+
+    int nrgs = 0;
+    khiter_t *rgs = malloc(kh_size(h) * sizeof(*rgs));
+    if (!rgs)
+        return -1;
+
+    for (khiter_t k = kh_begin(h); k != kh_end(h); k++)
+        if (kh_exist(h, k))
+            rgs[nrgs++] = k;
+
+    key_qsort_h = h; // Use a global to avoid extra hash lookups here
+    qsort(rgs, nrgs, sizeof(*rgs), key_qsort);
+    for (int k = 0; k < nrgs; k++)
+        sums_report(o, &kh_value(h, rgs[k]), kh_key(h, rgs[k]));
+
+    free(rgs);
+
+    return 0;
+}
+
+// Compatibility with biobambam2's bamseqchksum output format.
+//
+// Note this overwrites o->tabs, o->show_pass, o->verbose and
+// o->show_combine.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure
+int checksum_bamseqchksum(opts *o, sums_t *all, sums_t *noRG, khash_t(chk) *h){
+    // Why two tabs after count?
+    fprintf(o->fp, "###\tset\tcount\t\tb_seq\tname_b_seq\tb_seq_qual\tb_seq_tags(BC,FI,QT,RT,TC)\n");
+
+    o->tabs = 1;
+    o->show_pass = 1;
+    o->verbose = 1;
+    o->show_combine = 0;
+    sums_report(o, all, "all");
+    sums_report(o, noRG, "");
+
+    // Per read-group line
+    return sums_report_groups(o, h);
+}
+
+// Writes the full checksum report for file name fn to o->fp: the "#"
+// header lines, the "all" group, the "-" (no read-group) group and then
+// one group per key in h.  Uses the bamseqchksum layout instead when
+// o->compat is set.
+//
+// Returns 0 on success,
+//        -1 on failure
+int checksum_report(char *fn, opts *o,
+                    sums_t *all, sums_t *noRG, khash_t(chk) *h) {
+    if (o->compat)
+        return checksum_bamseqchksum(o, all, noRG, h);
+
+    // headers
+    fprintf(o->fp, "# Checksum 1.0 for file:%s%s\n",
+            o->tabs ? "\t" : " ", fn);
+    fprintf(o->fp, "# Aux tags:%s%s\n",
+            o->tabs ? "\t" : " ", o->tag_str);
+    char *s=bam_flag2str(o->flag_mask);
+    if (!s)
+        return -1;
+    fprintf(o->fp, "# BAM flags:%s%s\n",
+            o->tabs ? "\t" : " ", s);
+    free(s);
+    if (o->tabs)
+        fprintf(o->fp, "\n# Group\tQC\tcount\tflag+seq\t+name\t+qual\t+aux");
+    else
+        fprintf(o->fp, "\n# Group QC count flag+seq +name"
+                " +qual +aux ");
+    if (o->check_pos)
+        fprintf(o->fp, o->tabs ? "\t+chr/pos" : " +chr/pos");
+    if (o->check_cigar)
+        fprintf(o->fp, o->tabs ? "\t+cigar" : " +cigar ");
+    if (o->check_mate)
+        fprintf(o->fp, o->tabs ? "\t+mate" : " +mate ");
+    fprintf(o->fp, o->tabs ? "\tcombined\n" : " combined\n");
+
+    // All and "-" (no RG) lines
+    sums_report(o, all, "all");
+    if (o->verbose || (noRG->count[0] + noRG->count[1]))
+        sums_report(o, noRG, "-");
+
+    // Per read-group line
+    return sums_report_groups(o, h);
+}
+
+// Checksums a single SAM/BAM/CRAM file and writes the report to o->fp.
+//
+// Records are filtered by o->excl_flags and o->req_flags, and reading stops
+// early after o->nrec records when that is non-zero.
+//
+// Returns 0 on success,
+//        -1 on failure
+int checksum(sam_global_args *ga, opts *o, char *fn) {
+    samFile *fp = NULL;
+    sam_hdr_t *hdr = NULL;
+    bam1_t *b = bam_init1();
+    char **tags = o->tags;
+    int ntags = o->ntags;
+    uint8_t **tag_ptr = calloc(65536, sizeof(*tag_ptr));
+    size_t *tag_len = calloc(65536, sizeof(*tag_len));
+    kstring_t aux_ks = KS_INITIALIZE;
+    kstring_t seq_ks = KS_INITIALIZE;
+    kstring_t qual_ks = KS_INITIALIZE;
+    khash_t(chk) *h = kh_init(chk);
+    int ret = -1;
+    int64_t nrec = o->nrec;
+
+//#undef HTS_LITTLE_ENDIAN // uncomment this to validate / debug
+
+    // Declared before the first "goto err" so that the cleanup code never
+    // frees an uninitialised kstring.
+#ifndef HTS_LITTLE_ENDIAN
+    kstring_t cigar_ks = KS_INITIALIZE;
+#endif
+
+    if (!b || !tag_ptr || !tag_len || !h)
+        goto err;
+
+    // A precomputed lookup table to speed up selection of tags
+    short tag_keep[75][75] = {0}; // 'z' is 122, '0' is 48.  122-48+1 == 75
+    for (int i = 0; i < ntags; i++) {
+        char *t = tags[i];
+        if (t[0] != '*' &&
+            !(t[0] >= '0' && t[0] <= 'z' &&
+              t[1] >= '0' && t[1] <= 'z')) {
+            fprintf(samtools_stderr, "[checksum] Illegal tag ID '%.2s'\n", t);
+            goto err;
+        }
+        if (t[0] != '*')
+            tag_keep[t[0]-'0'][t[1]-'0'] = i+1;
+    }
+
+    sums_t h32, noRG;
+    sums_init(&h32);
+    sums_init(&noRG);
+    uint32_t crc32_start = hts_crc32(0, NULL, 0);
+
+    fp = sam_open_format(fn, "r", &ga->in);
+    if (!fp) {
+        print_error_errno("checksum", "Cannot open input file \"%s\"", fn);
+        goto err;
+    }
+
+    if (ga->nthreads > 0)
+        hts_set_threads(fp, ga->nthreads);
+
+    if (!(hdr = sam_hdr_read(fp)))
+        goto err;
+
+    int r;
+    while ((r = sam_read1(fp, hdr, b)) >= 0) {
+        crcs_t c;
+
+        if (b->core.flag & o->excl_flags)
+            continue;
+
+        if ((b->core.flag & o->req_flags) != o->req_flags)
+            continue;
+
+        if (o->sanitize)
+            bam_sanitize(hdr, b, o->sanitize);
+
+        // 8 bits of flag corresponding to original instrument data
+        uint8_t flags = b->core.flag & o->flag_mask;
+
+        // Copy sequence out from nibble to base, and reverse complement
+        // seq / qual if required.  Qual is +33 (ASCII format) only for
+        // compatibility with biobambam's bamseqchksum tool.
+        // The +1 here and elsewhere is to force zero byte allocations to
+        // always return a pointer rather than NULL.  This in turn prevents
+        // crc32() from considering it as a reinitialisation.
+        if (ks_resize(&seq_ks,  b->core.l_qseq+1) < 0 ||
+            ks_resize(&qual_ks, b->core.l_qseq+1) < 0)
+            goto err;
+
+        fill_seq_qual(o, b, (uint8_t *)seq_ks.s, (uint8_t *)qual_ks.s);
+
+        // flag + seq
+        uint32_t crc = hts_crc32(crc32_start, &flags, 1);
+        c.seq = hts_crc32(crc, seq_ks.s, b->core.l_qseq);
+
+        // name + flag + seq.
+        // flag + seq + name would be faster, but bamseqchksum does this.
+        // Also include single nul for compatibility too.
+        crc = hts_crc32(crc32_start, bam_get_qname(b),
+                        b->core.l_qname - b->core.l_extranul);
+        crc = hts_crc32(crc, &flags, 1);
+        c.name = hts_crc32(crc, seq_ks.s, b->core.l_qseq);
+
+        // flag + seq + qual
+        c.qual = hts_crc32(c.seq, qual_ks.s, b->core.l_qseq);
+
+        // flag + seq + aux tags
+        uint8_t *RGZ = NULL;
+        if (hash_aux(b, &aux_ks, ntags, tags, tag_ptr, tag_len,
+                     o->tag_str, tag_keep, c.seq, &c.aux, &RGZ) < 0)
+            goto err;
+
+        // flag + seq + chr + pos
+        if (o->check_pos) {
+            uint8_t chr_pos[4+8];
+            u32_to_le(b->core.tid, chr_pos);
+            u64_to_le(b->core.pos, chr_pos+4);
+            c.pos = hts_crc32(c.seq, chr_pos, sizeof(chr_pos));
+        }
+
+        // flag + seq + rnext + pnext + tlen
+        if (o->check_mate) {
+            uint8_t mate[4+8+8];
+            u32_to_le(b->core.mtid,  mate);
+            u64_to_le(b->core.mpos,  mate+4);
+            u64_to_le(b->core.isize, mate+12);
+            // Cover the whole buffer so that tlen is checksummed too
+            c.mate = hts_crc32(c.seq, mate, sizeof(mate));
+        }
+
+        // flag + seq + mapq + cigar
+        if (o->check_cigar) {
+            uint8_t *cigar = (uint8_t *)bam_get_cigar(b);
+#ifndef HTS_LITTLE_ENDIAN
+            // Convert to little-endian byte order first so the checksum
+            // does not depend on the host.
+            if (ks_resize(&cigar_ks, 4 * b->core.n_cigar+1) < 0)
+                goto err;
+            uint32_t *cig32 = bam_get_cigar(b);
+            cigar = (uint8_t *)cigar_ks.s;
+
+            for (int i = 0; i < b->core.n_cigar; i++)
+                u32_to_le(cig32[i], cigar + 4*i);
+#endif
+            uint8_t mapq[4];
+            u32_to_le(b->core.qual, mapq);
+            c.cigar = hts_crc32(c.seq, mapq, 4);
+            c.cigar = hts_crc32(c.cigar, cigar, 4 * b->core.n_cigar);
+        }
+
+        // Aggregate checksum hashes
+        uint64_t count = h32.count[0];
+        if (RGZ) {
+            sums_t *h32p;
+
+            // Look up this read-group, creating its entry on first sight
+            int kret;
+            khiter_t k = kh_get(chk, h, (char *)RGZ);
+            if (k == kh_end(h)) {
+                char *rgz_ = strdup((char *)RGZ);
+                if (!rgz_)
+                    goto err;
+                k = kh_put(chk, h, rgz_, &kret);
+                if (kret < 0) {
+                    free(rgz_);
+                    goto err;
+                }
+                sums_init(&kh_value(h, k));
+            }
+            h32p = &kh_value(h, k);
+
+            count = h32p->count[0];
+            sums_update(b->core.flag & BAM_FQCFAIL, h32p, &c, o, count);
+        } else {
+            count = noRG.count[0];
+            sums_update(b->core.flag & BAM_FQCFAIL, &noRG, &c, o, count);
+        }
+
+        // The "all" group is passed the per-group count taken above
+        sums_update(b->core.flag & BAM_FQCFAIL, &h32, &c, o, count);
+
+        if (nrec && --nrec == 0)
+            break;
+    }
+
+    // -1 is a normal end of file; anything lower is a read error
+    if (r < -1)
+        goto err;
+
+    if (sam_close(fp) < 0) {
+        fp = NULL;
+        print_error_errno("checksum", "Closing input file \"%s\"", fn);
+        goto err;
+    }
+    fp = NULL;
+
+    // Report hashes
+    if (checksum_report(fn, o, &h32, &noRG, h) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b)   bam_destroy1(b);
+    if (hdr) sam_hdr_destroy(hdr);
+    if (fp)  sam_close(fp);
+
+    free(tag_ptr);
+    free(tag_len);
+    ks_free(&aux_ks);
+    ks_free(&seq_ks);
+    ks_free(&qual_ks);
+#ifndef HTS_LITTLE_ENDIAN
+    ks_free(&cigar_ks);
+#endif
+
+    if (h) {
+        // Keys were strdup()ed when inserted, so free them here
+        for (khiter_t k = kh_begin(h); k != kh_end(h); k++) {
+            if (!kh_exist(h, k))
+                continue;
+
+            free((char *)kh_key(h, k));
+        }
+        kh_destroy(chk, h);
+    }
+
+    return ret;
+}
+
+/* ----------------------------------------------------------------------
+ * Checksum combining.  This is used to merge multiple checksum output files
+ * from e.g. "samtools split" readgroup files, into a single combined
+ * checksum to give the same result as doing a samtools merge | checksum.
+ */
+
+// Process an individual file, aggregating to s, noRG and h
+//
+// fn is a checksum report as written by checksum_report().  The "# Group"
+// header line decides which columns are present, and sets o->check_pos,
+// o->check_cigar and o->check_mate accordingly.  "all" rows are skipped, as
+// the combined "all" group is rebuilt from the other rows.
+//
+// Returns 0 on success,
+//        -1 on failure
+static int sums_parse(opts *o, char *fn, sums_t *sums, sums_t *noRG,
+                      khash_t(chk) *h) {
+    int ret = -1;
+    FILE *fp;
+    if ((fp = fopen(fn, "r")) == NULL) {
+        perror(fn);
+        return -1;
+    }
+
+    kstring_t line = KS_INITIALIZE;
+    int nheader = 0;
+    enum {
+        H_GROUP, H_QC, H_COUNT, H_SEQ, H_NAME, H_QUAL, H_AUX,
+        H_POS, H_CIGAR, H_MATE, H_COMBINED
+    } header[11] = {-1,-1,-1,-1,-1, -1,-1,-1,-1,-1, -1};
+    crcs_t crcs = {1,1,1,1,1,1,1};
+
+    while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) {
+        if (strncmp(line.s, "# Checksum", 10) == 0) {
+            int major, minor;
+            if (sscanf(line.s, "# Checksum %d.%d", &major, &minor) == 2) {
+                if (major != 1 || minor != 0) {
+                    fprintf(samtools_stderr, "Unsupported checksum output version\n");
+                    goto err;
+                }
+            }
+            continue;
+        }
+
+        if (strncmp(line.s, "# Group", 7) == 0) {
+            // Parse column header so we know which fields are present
+            int n, i = 0, idx;
+            char *ptr = line.s+2;
+            char token[20];
+            while ((n = sscanf(ptr, "%19s%n", token, &idx)) == 1) {
+                // Guard the fixed size header[] array against a header
+                // line with more columns than we know about.
+                if (i >= (int)(sizeof(header) / sizeof(*header))) {
+                    fprintf(samtools_stderr, "Too many columns in header line\n");
+                    goto err;
+                }
+
+                if (strcmp(token, "Group") == 0)
+                    header[i] = H_GROUP;
+                else if (strcmp(token, "QC") == 0)
+                    header[i] = H_QC;
+                else if (strcmp(token, "count") == 0)
+                    header[i] = H_COUNT;
+                else if (strcmp(token, "flag+seq") == 0)
+                    header[i] = H_SEQ;
+                else if (strcmp(token, "+name") == 0)
+                    header[i] = H_NAME;
+                else if (strcmp(token, "+qual") == 0)
+                    header[i] = H_QUAL;
+                else if (strcmp(token, "+aux") == 0)
+                    header[i] = H_AUX;
+                else if (strcmp(token, "+chr/pos") == 0)
+                    header[i] = H_POS, o->check_pos = 1;
+                else if (strcmp(token, "+cigar") == 0)
+                    header[i] = H_CIGAR, o->check_cigar = 1;
+                else if (strcmp(token, "+mate") == 0)
+                    header[i] = H_MATE, o->check_mate = 1;
+                else if (strcmp(token, "combined") == 0)
+                    header[i] = H_COMBINED;
+                else {
+                    fprintf(samtools_stderr, "Unrecognised header token '%s'\n", token);
+                    goto err;
+                }
+
+                i++;
+                ptr += idx;
+            }
+            nheader = i;
+
+            continue;
+        }
+
+        if (strncmp(line.s, "# Aux", 5) == 0) {
+            int idx;
+            char c;
+            // Only the first file's tag list is kept
+            if (sscanf(line.s, "# Aux tags: %c%n", &c, &idx) == 1) {
+                if (!o->tag_str) {
+                    o->tag_free = o->tag_str = strdup(line.s + idx-1);
+                    if (!o->tag_str)
+                        goto err;
+                }
+            }
+
+            continue;
+        }
+
+        if (strncmp(line.s, "# BAM", 5) == 0) {
+            int idx;
+            char c;
+            if (sscanf(line.s, "# BAM flags: %c%n", &c, &idx) == 1)
+                o->flag_mask = bam_str2flag(line.s + idx-1);
+
+            continue;
+        }
+
+        if (!line.l || *line.s == '#')
+            continue;
+
+
+        // Header done.  Now parse the data lines
+        if (strncmp(line.s, "all ", 4) == 0 ||
+            strncmp(line.s, "all\t", 4) == 0)
+            continue;
+
+        char col[11][128], *ptr = line.s;
+        int nf;
+        for (nf = 0; nf < 11; nf++) {
+            int idx;
+            int n = sscanf(ptr, "%127s%n", col[nf], &idx);
+            if (n <= 0)
+                break;
+            if (strlen(col[nf]) == 127) {
+                fprintf(samtools_stderr, "Field too long\n");
+                goto err;
+            }
+            ptr += idx;
+        }
+
+        // Sanity check that header and rows match
+        if (nf < 8 || nf != nheader) {
+            fprintf(samtools_stderr, "Incorrect number of columns in line: %s\n",
+                    line.s);
+            goto err;
+        }
+
+        // Marry up column header with row entries and set struct.
+        // (We could update the struct to be numbered instead of
+        // named in variables to make this easier.)
+        int qc = 0;
+        uint64_t count = 0;
+        for (int i = 0; i < nf; i++) {
+            switch (header[i]) {
+            case H_QC:
+                if (strcmp(col[i], "all") == 0)
+                    qc = 0;
+                else if (strcmp(col[i], "pass") == 0)
+                    qc = 1;
+                else if (strcmp(col[i], "fail") == 0)
+                    qc = 2;
+                else
+                    goto err;
+                break;
+
+            case H_COUNT:
+                count = strtoull(col[i], NULL, 10);
+                break;
+
+            case H_SEQ:
+                crcs.seq = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_NAME:
+                crcs.name = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_QUAL:
+                crcs.qual = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_AUX:
+                crcs.aux = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_POS:
+                crcs.pos = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_CIGAR:
+                crcs.cigar = strtoul(col[i], NULL, 16);
+                break;
+
+            case H_MATE:
+                crcs.mate = strtoul(col[i], NULL, 16);
+                break;
+
+            default:
+                break;
+            }
+        }
+
+        // Add group entry
+        if (strcmp(col[0], "-") == 0) {
+            sums_update_row(qc, noRG, &crcs, 0, count);
+        } else {
+            int kret;
+            khiter_t k = kh_get(chk, h, col[0]);
+            if (k == kh_end(h)) {
+                char *rgz_ = strdup(col[0]);
+                if (!rgz_)
+                    goto err;
+                k = kh_put(chk, h, rgz_, &kret);
+                if (kret < 0) {
+                    free(rgz_);
+                    goto err;
+                }
+                sums_init(&kh_value(h, k));
+            }
+            sums_update_row(qc, &kh_value(h, k), &crcs, 0, count);
+        }
+
+        // Add to global "all" stats
+        sums_update_row(qc, sums, &crcs, 0, count);
+    }
+
+    ret = 0;
+
+ err:
+    ks_free(&line);
+    fclose(fp);
+    return ret;
+}
+
+// Combine multiple checksum files together and report the merged stats
+//
+// argv[0..argc-1] are the checksum report file names.  o->tag_str is
+// replaced by the tag list read from the files and is reset to NULL again
+// before returning.
+//
+// Returns 0 on success,
+//        -1 on failure
+int combine(opts *o, int argc, char **argv) {
+    int ret = -1;
+    sums_t s, noRG;
+    sums_init(&s);
+    sums_init(&noRG);
+
+    free(o->tag_free); // Probably NULL, but just in case
+    o->tag_free = o->tag_str = NULL;
+    khash_t(chk) *h = kh_init(chk);
+    if (!h)
+        goto err;
+    for (int i = 0; i < argc; i++) {
+        if (sums_parse(o, argv[i], &s, &noRG, h) < 0) {
+            fprintf(samtools_stderr, "Failed to parse checksum file '%s'\n", argv[i]);
+            goto err;
+        }
+    }
+    if (checksum_report("merge", o, &s, &noRG, h) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    // tag_str may alias tag_free, so clear both to avoid a dangling pointer
+    free(o->tag_free);
+    o->tag_free = o->tag_str = NULL;
+
+    if (h) {
+        for (khiter_t k = kh_begin(h); k != kh_end(h); k++) {
+            if (!kh_exist(h, k))
+                continue;
+
+            free((char *)kh_key(h, k));
+        }
+        kh_destroy(chk, h);
+    }
+
+    return ret;
+}
+
+/* ----------------------------------------------------------------------
+ * CLI
+ */
+
+// Prints the usage message to fp and exits with status ret.
+void usage_exit(FILE *fp, int ret) {
+    fprintf(fp, "Usage: samtools checksum [options] [file.bam ...]\n");
+    fprintf(fp, "or samtools checksum [options] -m [file.chk ...]\n\n");
+    fprintf(fp, "Options:\n\
+ -F, --exclude-flags FLAG Filter if any FLAGs are present [0x900]\n\
+ -f, --require-flags FLAG Filter unless all FLAGs are present [0]\n\
+ -b, --flag-mask FLAG BAM FLAGs to use in checksums [0x0c1]\n\
+ -c, --no-rev-comp Do not reverse-complement sequences [off]\n\
+ -t, --tags STR[,STR] Select tags to checksum [BC,FI,QT,RT,TC]\n\
+ -O, --in-order Use order-specific checksumming [off]\n\
+ -P, --check-pos Also checksum CHR / POS [off]\n\
+ -C, --check-cigar Also checksum MAPQ / CIGAR [off]\n\
+ -M, --check-mate Also checksum PNEXT / RNEXT / TLEN [off]\n\
+ -z, --sanitize FLAGS Perform sanity checks and fix records [off]\n\
+ -N, --count INT Stop after INT number of records [0]\n\
+ -o, --output FILE Write report to FILE [samtools_stdout]\n\
+ -q, --show-qc Also show QC pass/fail lines\n\
+ -v, --verbose Increase verbosity: show lines with 0 counts\n\
+ -a, --all Check all: -PCMOc -b 0xfff -f0 -F0 -z all,cigarx\n\
+ -T, --tabs Format output as tab delimited text\n\
+ -m, --merge FILE Merge checksum output (-o opt) files\n\
+ -B, --bamseqchksum Report in bamseqchksum format\n");
+    fprintf(fp, "\nGlobal options:\n");
+    sam_global_opt_help(fp, "-.---@--");
+    samtools_exit(ret);
+}
+
+// Splits o->tag_str into o->tags / o->ntags.
+//
+// Every comma separated element must be a two character tag name or a
+// single "*".  The o->tags entries point into o->tag_str rather than being
+// copies, so they are not individually nul terminated.
+//
+// Returns 0 on success,
+//         1 on failure (bad syntax or out of memory)
+int parse_tags(opts *o) {
+    // Count
+    int nt = 0;
+    for (char *t = o->tag_str; *t; t++) {
+        nt++;
+        char *l = t;
+        while (*t && *t != ',')
+            t++;
+        if (t-l != 2 && !(t-l == 1 && *l == '*')) {
+            fprintf(samtools_stderr, "Bad tag string. Should be XX,YY,... syntax\n");
+            return 1;
+        }
+        if (!*t)
+            break;
+    }
+
+    // Split by tag
+    o->ntags = nt;
+    o->tags = calloc(nt, sizeof(*o->tags));
+    if (!o->tags)
+        return 1;
+
+    nt = 0;
+    for (char *t = o->tag_str; *t; t++, nt++) {
+        o->tags[nt] = t;
+        while (*t && *t != ',')
+            t++;
+        if (!*t)
+            break;
+    }
+
+    return 0;
+}
+
+// Main command entry
+//
+// Returns 0 on success, non-zero on failure.
+int main_checksum(int argc, char **argv) {
+    opts opts = {
+        .req_flags    = 0,
+        .excl_flags   = BAM_FSECONDARY | BAM_FSUPPLEMENTARY,
+        .flag_mask    = BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2,
+        .rev_comp     = 1,
+        .tag_str      = "BC,FI,QT,RT,TC",
+        .tag_free     = NULL,
+        .check_pos    = 0,
+        .check_cigar  = 0,
+        .check_mate   = 0,
+        .in_order     = 0,
+        .sanitize     = 0,
+        .nrec         = 0,
+        .verbose      = 0,
+        .show_pass    = 0,
+        .show_fail    = 0,
+        .show_combine = 1,
+        .fp           = samtools_stdout,
+        .tabs         = 0,
+        .merge        = 0,
+    };
+
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 'I', '-', '-', '.', '@'),
+        {"exclude-flags", required_argument, NULL, 'F'},
+        {"require-flags", required_argument, NULL, 'f'},
+        {"flag-mask",     required_argument, NULL, 'b'},
+        {"tags",          required_argument, NULL, 't'},
+        {"no-rev-comp",   no_argument,       NULL, 'c'},
+        {"in-order",      no_argument,       NULL, 'O'},
+        {"check-pos",     no_argument,       NULL, 'P'},
+        {"check-cigar",   no_argument,       NULL, 'C'},
+        {"check-mate",    no_argument,       NULL, 'M'},
+        {"count",         required_argument, NULL, 'N'},
+        {"sanitize",      required_argument, NULL, 'z'},
+        {"output",        required_argument, NULL, 'o'},
+        {"show-qc",       no_argument,       NULL, 'q'},
+        {"verbose",       no_argument,       NULL, 'v'},
+        {"all",           no_argument,       NULL, 'a'},
+        {"tabs",          no_argument,       NULL, 'T'},
+        {"merge",         no_argument,       NULL, 'm'},
+        {"bamseqchksum",  no_argument,       NULL, 'B'},
+        {NULL, 0, NULL, 0}
+    };
+
+    if (argc == 1 && isatty(STDIN_FILENO))
+        usage_exit(samtools_stdout, EXIT_SUCCESS);
+
+    int c;
+    while ((c = getopt_long(argc, argv, "@:f:F:t:cPCMOb:z:aN:vqo:TmB",
+                            lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'O':
+            opts.in_order++;
+            break;
+        case 'F':
+            if ((opts.excl_flags = bam_str2flag(optarg)) < 0) {
+                print_error("checksum", "could not parse flag %s", optarg);
+                return 1;
+            }
+            break;
+        case 'f':
+            if ((opts.req_flags = bam_str2flag(optarg)) < 0) {
+                print_error("checksum", "could not parse flag %s", optarg);
+                return 1;
+            }
+            break;
+        case 'b':
+            if ((opts.flag_mask = bam_str2flag(optarg)) < 0) {
+                print_error("checksum", "could not parse flag %s", optarg);
+                return 1;
+            }
+            break;
+        case 'P':
+            opts.check_pos = 1;
+            break;
+        case 'C':
+            opts.check_cigar = 1;
+            break;
+        case 'M':
+            opts.check_mate = 1;
+            break;
+        case 't':
+            opts.tag_str = optarg;
+            break;
+        case 'c':
+            opts.rev_comp = 0;
+            break;
+        case 'N':
+            opts.nrec = strtoll(optarg, NULL, 0);
+            break;
+
+        case 'B':
+            opts.compat = 1;
+            opts.show_pass = 1;
+            break;
+        case 'v':
+            opts.verbose++;
+            break;
+        case 'q':
+            opts.show_pass = opts.show_fail = 1;
+            break;
+        case 'T':
+            opts.tabs = 1;
+            break;
+        case 'm':
+            opts.merge = 1;
+            break;
+
+        case 'z':
+            if ((opts.sanitize = bam_sanitize_options(optarg)) < 0)
+                return 1;
+            break;
+
+        case 'a':
+            // ALL: a shorthand for a bunch of options to checksum the entire
+            // file contents.  TODO: we still need tag wildcards.
+            opts.req_flags = 0;
+            opts.excl_flags = 0;
+            opts.flag_mask = -1;
+            opts.rev_comp = 0;
+            opts.in_order = 1;
+            opts.check_pos = 1;
+            opts.check_cigar = 1;
+            opts.check_mate = 1;
+            opts.sanitize = FIX_ALL | FIX_CIGARX;
+            opts.tag_str = "*,cF,MD,NM";
+            break;
+
+        case 'o':
+            opts.fp = fopen(optarg, "w");
+            if (!opts.fp) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0)
+                break;
+            /* else fall-through */
+        case '?':
+            usage_exit(samtools_stderr, EXIT_FAILURE);
+        }
+    }
+
+    if (!opts.tags) {
+        // parse_tags() reports failure with a positive value, so test for
+        // any non-zero result rather than only negative ones.
+        if (parse_tags(&opts) != 0)
+            return 1;
+    }
+
+    int ret = 0;
+    if (opts.merge) {
+        ret = combine(&opts, argc - optind, argv+optind);
+    } else {
+        if (argc-optind) {
+            while (optind < argc)
+                ret |= checksum(&ga, &opts, argv[optind++]) < 0;
+        } else {
+            ret = checksum(&ga, &opts, "-") < 0;
+        }
+    }
+
+    if (opts.fp != samtools_stdout)
+        ret |= fclose(opts.fp) < 0;
+
+    free(opts.tags);
+    free(opts.tag_free);
+
+    if (ret)
+        fprintf(samtools_stderr, "[checksum] Failed to process data\n");
+
+    return ret;
+}
diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c index 8572e0f0..1f33af88 100644 --- a/samtools/bam_consensus.c +++ b/samtools/bam_consensus.c @@ -1,7 +1,7 @@ /* bam_consensus.c -- consensus subcommand. Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) - Copyright (C) 2003-2005,2007-2024 Genome Research Ltd. + Copyright (C) 2003-2005,2007-2025 Genome Research Ltd. Author: James Bonfield @@ -132,9 +132,14 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include +#include +#include +#include #include "samtools.h" #include "sam_opts.h" @@ -188,6 +193,20 @@ typedef struct { int omap[101]; // overcall or INS } qcal_t; +// Persistent data used per thread and reused in each job operating in +// the same thread. This is for things like file handles where we +// can't share the same pointer due to read buffers.
We also have one +// of these for the main thread for consistency in non-threaded +// operation. Index 0 is main, 1-N is threads. +typedef struct { + samFile *fp; // BAM files e.g. + hts_idx_t *idx; // BAI index e.g. + faidx_t *fai; // FAI index + int ref_tid; // reference chr value + char *ref; // reference sequence + hts_pos_t ref_len; // ref seq length +} thread_data_t; + typedef struct { // User options char *reg; @@ -224,19 +243,57 @@ typedef struct { double homopoly_fix; double homopoly_redux; qcal_t qcal; + char *ref_fn; + int ref_qual; + int span; // base block size for threads - // Internal state - samFile *fp; + // Internal state, shared between threads + char *fn; FILE *fp_out; + sam_global_args ga_in; sam_hdr_t *h; - hts_idx_t *idx; - hts_itr_t *iter; - kstring_t ks_line; + int nthreads; + hts_tpool *pool; + thread_data_t *tdata; // one per thread + 1 for main +} consensus_opts; + +// Thread specific state (or once only in main if not threaded) +typedef struct { + // Both threaded and non-threaded + consensus_opts *opts; // cached copy; FIXME + kstring_t ks_pileup; kstring_t ks_ins_seq; kstring_t ks_ins_qual; + hts_pos_t ks_ins_start; // first real base, buffer index + hts_pos_t first_pos, last_pos; // genomic coords of first/last base int last_tid; - hts_pos_t last_pos; -} consensus_opts; + char *ref; + hts_pos_t ref_len; + int ref_tid; + hts_itr_t *iter; + + // Threaded only + int counter; // block number + samFile *fp; // thread specific file pointer + sam_hdr_t *h; + pthread_t pid; + int tid; + hts_pos_t start, end; // region to process + int first, last; // region is at start of contig or end of contig + int (*seq_column)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert); +} ctx; + +// Returns the thread_data struct associated with this thread ID +static thread_data_t *thread_data(consensus_opts *opts) { + return &opts->tdata[hts_tpool_worker_id(opts->pool)+1]; +} /* 
-------------------------------------------------------------------------- * A bayesian consensus algorithm that analyses the data to work out @@ -358,7 +415,7 @@ static cons_probs cons_prob_recall, cons_prob_precise; * * The heterozygosity weight though is a per column calculation as we're * trying to model whether the column is pure or mixed. Hence this is done - * once via a prior and has no affect on the individual matrix cells. + * once via a prior and has no effect on the individual matrix cells. * * We have a generic indel probability, but it's a catch all for overcall, * undercall, alignment artifacts, homopolymer issues, etc. So we can set @@ -399,7 +456,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + }, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -409,7 +467,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + }, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -419,7 +478,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99} + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + } }, { // HiFi @@ -432,29 +492,29 @@ static qcal_t static_qcal[6] = { 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - }, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44 + }, { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, - 10, 11, 11, 12, 13, 
14, 15, 15, 16, 17, - 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, - 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, - 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, - 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, - 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, - 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, - }, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, - 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, - 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, - 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, } }, @@ -496,36 +556,36 @@ static qcal_t static_qcal[6] = { { // ONT R10.4 super { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 12, 13, 14, 15, 15, 16, 17, - 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, - 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, - 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, - 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, - 40, 41, 41, 40, 40, 41, 40, 40, 39, 41, - 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, - 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, - 40, 40, 40, 40, 
40, 40, 40, 40, 40, 40, + 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, + 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, + 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, + 40, 41, 41, 40, 40, 41, 40, 40, 39, 41, + 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, }, { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 10, 11, 12, 12, 13, - 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, - 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, - 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, }, { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, - 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, + 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, } }, { // ONT R10.4 duplex; just a copy of hifi for now @@ -539,28 +599,28 @@ static qcal_t static_qcal[6] = { 44, 44, 44, 44, 44, 44, 44, 44, 44, 
44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - }, + }, { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, - 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, - 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, - 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, - 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, - 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, - 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, - 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, - }, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, - 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, - 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, - 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, } }, { // Ultima Genomics @@ -947,7 +1007,7 @@ int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { * Returns 0 (discard) or 1 (keep) on success, -1 on failure. 
*/ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { - consensus_opts *opts = (consensus_opts *)client_data; + consensus_opts *opts = ((ctx *)client_data)->opts; if (!opts->use_mqual) return 1; @@ -1189,14 +1249,14 @@ double lnbinprobhalf(int n, double k) { } #endif +#include "bam_consensus_tab.h" + static int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, pileup_t *plp, consensus_opts *opts, consensus_t *cons, int default_qual, cons_probs *cp) { - int i, j; - static int init_done =0; - static double q2p[101], mqual_pow[256]; + int j; double min_e_exp = DBL_MIN_EXP * log(2) + 1; double S[15] ALIGNED(16) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; @@ -1226,23 +1286,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, 18, 19, 24}; - if (!init_done) { - init_done = 1; - - for (i = 0; i <= 100; i++) { - q2p[i] = pow(10, -i/10.0); - } - - for (i = 0; i < 255; i++) { - //mqual_pow[i] = 1-pow(10, -(i+.01)/10.0); - mqual_pow[i] = 1-pow(10, -(i*.9)/10.0); - //mqual_pow[i] = 1-pow(10, -(i/3+.1)/10.0); - //mqual_pow[i] = 1-pow(10, -(i/2+.05)/10.0); - } - // unknown mqual - mqual_pow[255] = mqual_pow[10]; - } - /* Initialise */ int counts[6] = {0}; #ifdef DO_FRACT @@ -1351,9 +1394,16 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (mqual > opts->high_mqual) mqual = opts->high_mqual; - double _p = 1-q2p[qual]; - double _m = mqual_pow[mqual]; - qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT +// double _p = 1-q2p[qual]; +// double _m = mqual_pow[mqual]; +// qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT + + // Equivalent to the above, but avoiding numbers very close to 1 + // This is also marginally faster. 
+ double _P = q2p[qual]; + double _M = mqual_pow_1m[mqual]; + qual = ph_log(_P+.75*_M-_P*_M); + //qual = ph_log(1-_p*_m); // testing //qual *= 6/sqrt(td); } @@ -1818,58 +1868,6 @@ int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth, return 0; } -/* -------------------------------------------------------------------------- - * Main processing logic - */ - -static void dump_fastq(consensus_opts *opts, - const char *name, - const char *seq, size_t seq_l, - const char *qual, size_t qual_l) { - enum format fmt = opts->fmt; - int line_len = opts->line_len; - FILE *fp = opts->fp_out; - - fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); - size_t i; - for (i = 0; i < seq_l; i += line_len) - fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); - - if (fmt == FASTQ) { - fprintf(fp, "+\n"); - for (i = 0; i < seq_l; i += line_len) - fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); - } -} - -//--------------------------------------------------------------------------- - -/* - * Reads a single alignment record, using either the iterator - * or a direct sam_read1 call. - */ -static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { - consensus_opts *opts = (consensus_opts *)dat; - - for (;;) { - int ret = opts->iter - ? sam_itr_next(fp, opts->iter, b) - : sam_read1(fp, h, b); - if (ret < 0) - return ret; - - // Apply hard filters - if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) - continue; - if (opts->excl_flags && (b->core.flag & opts->excl_flags)) - continue; - if (b->core.qual < opts->min_mqual) - continue; - - return ret; - } -} - /* -------------------------------------------------------------------------- * A simple summing algorithm, either pure base frequency, or by * weighting them according to their quality values. 
@@ -2008,19 +2006,180 @@ static int calculate_consensus_simple(const pileup_t *plp, return het[used_base]; } -static int empty_pileup2(consensus_opts *opts, sam_hdr_t *h, int tid, + +/* -------------------------------------------------------------------------- + * Main processing logic + */ + +/* + * Ensure opts->ref is up to date. + * Returns >=0 on success (length) + * -1 on failure + */ +static hts_pos_t update_ref(ctx *c, int tid) { + consensus_opts *opts = c->opts; + if (!opts->ref_fn) + return 0; + + thread_data_t *tdata = thread_data(opts); + if (tid == tdata->ref_tid && tdata->ref) { + c->ref = tdata->ref; + return tdata->ref_len; + } + + free(tdata->ref); + tdata->ref = NULL; + tdata->ref_tid = tid; + + c->ref = NULL; + + const char *chr = sam_hdr_tid2name(opts->h, tid); + if (!chr) + return -1; + + if (!(tdata->ref = fai_fetch64(tdata->fai, chr, &tdata->ref_len))) + return -1; + c->ref = tdata->ref; + c->ref_tid = tid; + c->ref_len = tdata->ref_len; + + return c->ref_len; +} +// Outputs a FASTA or FASTQ consensus sequence +static void dump_fastq(consensus_opts *opts, + const char *name, + const char *seq, size_t seq_l, + const char *qual, size_t qual_l) { + enum format fmt = opts->fmt; + int line_len = opts->line_len; + FILE *fp = opts->fp_out; + + if (!seq_l) + return; + + fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); + size_t i; + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); + + if (fmt == FASTQ) { + fprintf(fp, "+\n"); + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); + } +} + +//--------------------------------------------------------------------------- + +/* + * Reads a single alignment record, using either the iterator + * or a direct sam_read1 call. This also applies the include/exclude filters. 
+ */ +static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { + ctx *c = (ctx *)dat; + consensus_opts *opts = c->opts; + + for (;;) { + int ret = c->iter + ? sam_itr_next(fp, c->iter, b) + : sam_read1(fp, h, b); + if (ret < 0) + return ret; + + // Apply hard filters + if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) + continue; + if (opts->excl_flags && (b->core.flag & opts->excl_flags)) + continue; + if (b->core.qual < opts->min_mqual) + continue; + + return ret; + } +} + +// Output/append a portion of empty pileup. This may be N/0 or ref/qual. +static int empty_pileup2(ctx *c, sam_hdr_t *h, int tid, int threaded, hts_pos_t start, hts_pos_t end) { + consensus_opts *opts = c->opts; const char *name = sam_hdr_tid2name(h, tid); hts_pos_t i; - int err = 0; - for (i = start; i < end; i++) - err |= fprintf(opts->fp_out, "%s\t%"PRIhts_pos"\t0\t0\tN\t0\t*\t*\n", name, i+1) < 0; + + char *rseq = NULL; + if (opts->ref_fn && (err |= (update_ref(c, tid) <= 0)) == 0) + rseq = c->ref; + + if (threaded) { + kstring_t *ks = &c->ks_pileup; + for (i = start; i < end; i++) + err |= ksprintf(ks, + "%s\t%"PRIhts_pos"\t0\t0\t%c\t0\t*\t*\n", + name, i+1, rseq ? rseq[i] : 'N') < 0; + } else { + for (i = start; i < end; i++) + err |= fprintf(opts->fp_out, + "%s\t%"PRIhts_pos"\t0\t0\t%c\t0\t*\t*\n", + name, i+1, rseq ? rseq[i] : 'N') < 0; + } return err ? -1 : 0; } /* + * Compute consensus for a specific base. Fills out base and qual. + * Returns 0 on success, + * -1 on failure. + */ +int consensus_base(consensus_opts *opts, + pileup_t *p, hts_pos_t pos, int depth, + int *base, int *qual) { + int cb, cq; + + if (opts->mode != MODE_SIMPLE) { + consensus_t cons; + calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); + if (cons.depth < opts->min_depth && cons.call != 4) { + // && cons.call != 4. 
See #2167 + cb = 'N'; + cq = 0; + } else if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else { + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*' && + cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { + // het base/* keeps base or * as most likely pure call, else N. + // We still set quality to zero however as this is more useful + // than simply changing the base to N. + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + *base = cb; + *qual = cq; + + return 0; +} + +/* + * Callback from the pileup algorithm. + * Adds pileup format consensus for a specific column. + * * Returns 0 on success * -1 on failure */ @@ -2029,82 +2188,55 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, unsigned char *qp, *cp; char *rp; int ref, cb, cq; - consensus_opts *opts = (consensus_opts *)cd; + ctx *c = (ctx *)cd; + consensus_opts *opts = c->opts; int tid = p->b.core.tid; -// opts->show_ins=0; -// opts->show_del=1; if (!opts->show_ins && nth) return 0; - if (opts->iter) { - if (opts->iter->beg >= pos || opts->iter->end < pos) + if (c->iter) { + if (c->iter->beg >= pos || c->iter->end < pos) return 0; } if (opts->all_bases) { - if (tid != opts->last_tid && opts->last_tid >= -1) { - if (opts->last_tid >= 0) { + if (tid != c->last_tid && c->last_tid >= -1) { + if (c->last_tid >= 0) { // remainder of previous ref - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (opts->iter) - len = MIN(opts->iter->end, len); - if (empty_pileup2(opts, opts->h, opts->last_tid, - opts->last_pos, len) < 0) + hts_pos_t len = sam_hdr_tid2len(opts->h, c->last_tid); + if (c->iter) + len = MIN(c->iter->end, len); + if (empty_pileup2(c, opts->h, c->last_tid, opts->nthreads, + c->last_pos, len) < 0) return -1; } - opts->last_pos = 
opts->iter ? opts->iter->beg : 0; + c->last_pos = c->iter ? c->iter->beg : 0; } // Any refs between last_tid and tid - if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) { - while (++opts->last_tid < tid) { - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0) + if (!c->iter && tid > c->last_tid && opts->all_bases > 1) { + while (++c->last_tid < tid) { + hts_pos_t len = sam_hdr_tid2len(opts->h, c->last_tid); + if (empty_pileup2(c, opts->h, c->last_tid, 0, 0, len) < 0) return -1; } } // Any gaps in this ref (same tid) or at start of this new tid - if (opts->last_pos >= 0 && pos > opts->last_pos+1) { - if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, - pos-1) < 0) + if (c->last_pos >= 0 && pos > c->last_pos+1) { + if (empty_pileup2(c, opts->h, p->b.core.tid, opts->nthreads, + c->last_pos, pos-1) < 0) return -1; - } else if (opts->last_pos < 0) { - if (empty_pileup2(opts, opts->h, p->b.core.tid, - opts->iter ? opts->iter->beg : 0, pos-1) < 0) + } else if (c->last_pos < 0) { + if (empty_pileup2(c, opts->h, p->b.core.tid, opts->nthreads, + c->iter ? c->iter->beg : 0, pos-1) < 0) return -1; } } - if (opts->mode != MODE_SIMPLE) { - consensus_t cons; - calculate_consensus_gap5m(pos, opts->use_mqual ? 
CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual, - &cons_prob_recall, &cons_prob_precise); - if (cons.depth < opts->min_depth) { - cb = 'N'; - cq = 0; - } else if (cons.het_logodd > 0 && opts->ambig) { - cb = "AMRWa" // 5x5 matrix with ACGT* per row / col - "MCSYc" - "RSGKg" - "WYKTt" - "acgt*"[cons.het_call]; - cq = cons.het_logodd; - } else{ - cb = "ACGT*"[cons.call]; - cq = cons.phred; - } - if (cq < opts->cons_cutoff && cb != '*') { - cb = 'N'; - cq = 0; - } - } else { - cb = calculate_consensus_simple(p, opts, &cq); - } - if (cb < 0) + if (consensus_base(opts, p, pos, depth, &cb, &cq) < 0) return -1; if (!p) @@ -2114,8 +2246,7 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; /* Ref, pos, nth, score, seq, qual */ - kstring_t *ks = &opts->ks_line; - ks->l = 0; + kstring_t *ks = &c->ks_pileup; ref = p->b.core.tid; rp = (char *)sam_hdr_tid2name(h, ref); @@ -2155,140 +2286,161 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } *cp++ = '\t'; *qp++ = '\n'; - if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) - return -1; - opts->last_pos = pos; - opts->last_tid = tid; + if (!opts->nthreads) { + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + return -1; + ks->l = 0; + } + + c->last_pos = pos; + c->last_tid = tid; return 0; } +/* + * Callback from the pileup algorithm. + * Adds fastq/fasta format consensus for a specific column. + * + * We either call this for a single thread with the entire region (c->iter) + * or entire file (no iterator), or it gets called repeatedly from threads + * with sub-regions. When we're dealing with the latter we need to track + * which bases we're filling out with Ns so we can trim if needed. 
+ * + * This updates c->ks_ins_{seq,qual} seq and qual + * c->ks_ins_start index to seq/qual for 1st non-N + * c->first_pos first non-N in genome coords + * c->last_pos last genome position processed + * c->last_tid last genome chr processed + * (Amongst other variables) + * + * Returns 0 on success, + * -1 on failure. + */ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, int depth, hts_pos_t pos, int nth, int is_insert) { int cb, cq; - consensus_opts *opts = (consensus_opts *)cd; + ctx *c = (ctx *)cd; + consensus_opts *opts = c->opts; int tid = p->b.core.tid; - kstring_t *seq = &opts->ks_ins_seq; - kstring_t *qual = &opts->ks_ins_qual; + kstring_t *seq = &c->ks_ins_seq; + kstring_t *qual = &c->ks_ins_qual; if (!opts->show_ins && nth) return 0; - if (opts->iter) { - if (opts->iter->beg >= pos || opts->iter->end < pos) + if (c->iter) { + if (c->iter->beg >= pos || c->iter->end < pos) return 0; } + if (c->first_pos > pos) + c->first_pos = pos; + next_ref: - if (tid != opts->last_tid) { - if (opts->last_tid != -1) { + if (tid != c->last_tid) { + if (c->last_tid != -1) { if (opts->all_bases) { // Fill in remainder of previous reference int i, N; - if (opts->iter) { - opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); - N = opts->iter->end; + if (c->iter) { + c->last_pos = MAX(c->last_pos, c->iter->beg-1); + N = c->iter->end; } else { N = INT_MAX; } - N = MIN(N, sam_hdr_tid2len(opts->h, opts->last_tid)) - - opts->last_pos; + N = MIN(N, sam_hdr_tid2len(opts->h, c->last_tid)) + - c->last_pos; if (N > 0) { if (ks_expand(seq, N+1) < 0) return -1; if (ks_expand(qual, N+1) < 0) return -1; - for (i = 0; i < N; i++) { - seq->s[seq->l++] = 'N'; - qual->s[qual->l++] = '!'; + if (c->ref) { + hts_pos_t rlen; + if ((rlen = update_ref(c, c->last_tid)) < 0) + return -1; + for (i = 0; i < N; i++) { + seq->s[seq->l++] = c->ref[c->last_pos+i]; + qual->s[qual->l++] = opts->ref_qual + '!'; + } + } else { + for (i = 0; i < N; i++) { + seq->s[seq->l++] = 'N'; + 
qual->s[qual->l++] = '!'; + } } seq->s[seq->l] = 0; qual->s[qual->l] = 0; } } - dump_fastq(opts, sam_hdr_tid2name(opts->h, opts->last_tid), + dump_fastq(opts, sam_hdr_tid2name(opts->h, c->last_tid), seq->s, seq->l, qual->s, qual->l); } + if (update_ref(c, tid) < 0) + return -1; seq->l = 0; qual->l = 0; - if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) { - opts->last_pos = 0; + if (!c->iter && opts->all_bases > 1 && ++c->last_tid < tid) { + c->last_pos = 0; goto next_ref; } - opts->last_tid = tid; - if (opts->iter) - opts->last_pos = opts->iter->beg; + c->last_tid = tid; + if (c->iter) + c->last_pos = opts->all_bases ? c->iter->beg : pos-1; else - opts->last_pos = opts->all_bases ? 0 : pos-1; + c->last_pos = opts->all_bases ? 0 : pos-1; } - // share this with basic_pileup - if (opts->mode != MODE_SIMPLE) { - consensus_t cons; - calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual, - &cons_prob_recall, &cons_prob_precise); - if (cons.depth < opts->min_depth) { - cb = 'N'; - cq = 0; - } else if (cons.het_logodd > 0 && opts->ambig) { - cb = "AMRWa" // 5x5 matrix with ACGT* per row / col - "MCSYc" - "RSGKg" - "WYKTt" - "acgt*"[cons.het_call]; - cq = cons.het_logodd; - } else { - cb = "ACGT*"[cons.call]; - cq = cons.phred; - } - if (cq < opts->cons_cutoff && cb != '*' && - cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { - // het base/* keeps base or * as most likely pure call, else N. - // This is because we don't have a traditional way of representing - // base or not-base ambiguity. 
- cb = 'N'; - cq = 0; - } - } else { - cb = calculate_consensus_simple(p, opts, &cq); - } - if (cb < 0) + if (consensus_base(opts, p, pos, depth, &cb, &cq) < 0) return -1; if (!p) return 0; if (!opts->show_del && cb == '*') { - opts->last_pos = pos; - opts->last_tid = tid; + c->last_pos = pos; + c->last_tid = tid; return 0; } + if (opts->mark_ins && nth && cb != '*') { kputc('_', seq); kputc('_', qual); } - // end of share - // Append consensus base/qual to seqs - if (pos > opts->last_pos) { - if (opts->last_pos >= 0 || opts->all_bases) { - // FIXME: don't expand qual if fasta - if (ks_expand(seq, pos - opts->last_pos) < 0 || - ks_expand(qual, pos - opts->last_pos) < 0) + if (pos > c->last_pos) { + if (c->last_pos > 0 || opts->all_bases) { + if (ks_expand(seq, pos - c->last_pos) < 0 || + (opts->fmt == FASTQ && + ks_expand(qual, pos - c->last_pos) < 0)) return -1; - memset(seq->s + seq->l, 'N', pos - (opts->last_pos+1)); - memset(qual->s + qual->l, '!', pos - (opts->last_pos+1)); - seq->l += pos - (opts->last_pos+1); - qual->l += pos - (opts->last_pos+1); + if (update_ref(c, tid) < 0) + return -1; + if (c->ref) { + // last bases of the previous reference + memcpy(seq->s + seq->l, c->ref + c->last_pos, + pos - (c->last_pos+1)); + if (opts->fmt == FASTQ) + memset(qual->s + qual->l, opts->ref_qual + '!', + pos - (c->last_pos+1)); + } else { + memset(seq->s + seq->l, 'N', pos - (c->last_pos+1)); + if (opts->fmt == FASTQ) + memset(qual->s + qual->l, '!', pos - (c->last_pos+1)); + } + seq->l += pos - (c->last_pos+1); + qual->l += pos - (c->last_pos+1); } } if ((nth && opts->show_ins && cb != '*') - || cb != '*' || (pos > opts->last_pos && opts->show_del)) { + || cb != '*' || (pos > c->last_pos && opts->show_del)) { + if (c->ks_ins_start == -1) + c->ks_ins_start = seq->l; int err = 0; err |= kputc(cb, seq) < 0; err |= kputc(MIN(cq, '~'-'!')+'!', qual) < 0; @@ -2296,12 +2448,466 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return -1; } - 
opts->last_pos = pos; - opts->last_tid = tid; + c->last_pos = pos; + c->last_tid = tid; + + return 0; +} + +/* + * Computes pileup or fasta/q consensus for a given region. This is executed + * within a worker thread. + */ +void *pileup_job(void *data) { + // A local copy of consensus_opts, per pileup context + ctx *c = (ctx *)data; + consensus_opts *opts = c->opts; + + thread_data_t *tdata = thread_data(opts); + samFile *fp = tdata->fp; + + // Do the pileup job on our local iterator region + c->iter = sam_itr_queryi(tdata->idx, c->tid, c->start, c->end); + pileup_loop(fp, c->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + c->seq_column, nm_free, c); + + if (opts->fmt == PILEUP && c->last_pos < c->end && opts->all_bases) { + hts_pos_t beg = MAX(c->iter ? c->iter->beg : 0, c->last_pos); + empty_pileup2(c, opts->h, c->tid, 1, beg, c->end); + } + + sam_itr_destroy(c->iter); + + return c; +} + + +// Copy the reference to fastq if known, or fill out Ns. +// Returns 0 on success, -1 on failure +int ref_or_Ns(ctx *c, kstring_t *seq, kstring_t *qual, + hts_pos_t pos, hts_pos_t Nlen) { + consensus_opts *opts = c->opts; + + if (ks_resize(seq, seq->l + Nlen+1) < 0) + return -1; + if (opts->fmt == FASTQ) + if (ks_resize(qual, qual->l + Nlen+1) < 0) + return -1; + + if (opts->ref_fn) { + hts_pos_t rlen; + if ((rlen = update_ref(c, c->tid)) < 0) + return -1; + memcpy(seq->s + seq->l, &c->ref[pos], Nlen); + seq->s[seq->l += Nlen] = 0; + if (opts->fmt == FASTQ) { + memset(qual->s + qual->l, opts->ref_qual + '!', Nlen); + qual->s[qual->l += Nlen] = 0; + } + } else { + memset(seq->s + seq->l, 'N', Nlen); + seq->s[seq->l += Nlen] = 0; + if (opts->fmt == FASTQ) { + memset(qual->s + qual->l, '!', Nlen); + qual->s[qual->l += Nlen] = 0; + } + } return 0; } +// Append a chunk of sequence data in seq/qual to ks. +// If we're in the middle of the requestion range then we need to pad +// any starting and ending locations with Ns. 
Otherwise we can omit them +// iff the -a option wasn't specified. +int append_cons(ctx *c, kstring_t *seq, kstring_t *qual, + hts_pos_t *used_start, hts_pos_t *used_end) { + consensus_opts *opts = c->opts; + + // Our block returned is based on first..last covered within that block. + // With -a we expand that to start..end of region. Similarly if the block + // is internal (not the first or last chunk) we also have to expand. + if (opts->all_bases || *used_start >= 0) { + hts_pos_t Nlen = c->ks_ins_start == -1 + ? c->end - c->start // entire block is missing + : c->first_pos - c->start - 1; // first covered base + if (Nlen) { + if (ks_resize(seq, seq->l + Nlen+1) < 0) + return -1; + if (ref_or_Ns(c, seq, qual, c->start, Nlen) < 0) + return -1; + + if (*used_start == -1) + *used_start = 0; // we now count the Ns. + if (opts->all_bases) + *used_end = seq->l; + } + } + + // Seq may start with Ns and we may wish to trim them if we've not emitted + // any sequence yet. This is dealt with already in *used_start >= 0 above, + // so we don't want to add twice. + char *ks_seq = c->ks_ins_seq.s, *ks_qual = c->ks_ins_qual.s; + int64_t ks_seq_l = c->ks_ins_seq.l, ks_qual_l = c->ks_ins_qual.l; + if (c->ks_ins_start >= 0) { + ks_seq += c->ks_ins_start; + ks_qual += c->ks_ins_start; + ks_seq_l -= c->ks_ins_start; + ks_qual_l -= c->ks_ins_start; + } + + // The real sequence + if (ks_seq_l) { + kputsn(ks_seq, ks_seq_l, seq); + kputsn(ks_qual, ks_qual_l, qual); + + // Any post Ns + int Nlen = c->end - c->last_pos; + *used_end = seq->l; + *used_start = 0; + if (Nlen) { + if (ref_or_Ns(c, seq, qual, c->last_pos, Nlen) < 0) + return -1; + + if (opts->all_bases) { + *used_end = seq->l; + } + } + } + + return 0; +} + +// Parallel consensus generation. +// +// Executes in the main thread but despatches jobs to the worker thread to +// do the bulk of the pileup/fastq creation. 
+int pileup_loop_parallel(consensus_opts *opts) { + int chr = 0, err = -1; + hts_pos_t start; + hts_pos_t end; + hts_tpool *pool = NULL; + hts_tpool_process *q = NULL; + + hts_pos_t used_start, used_end; + + //printf("Reg %s:%ld-%ld\n", opts->h->target_name[chr], start, end); + int counter = 0, received = 0; + + thread_data_t *tdata = opts->tdata; + for (int i = 1; i <= opts->nthreads; i++) { + tdata[i].fp = sam_open_format(opts->fn, "r", + (htsFormat *)&opts->ga_in); + if (!tdata[i].fp) + goto err; + if (opts->ref_fn) { + if (!(tdata[i].fai = fai_load(opts->ref_fn))) + goto err; + } + if (tdata[i].fp->format.format == cram) { + // For CRAM, as indices are tied to a file descriptor + tdata[i].idx = sam_index_load(tdata[i].fp, opts->fn); + } else { + tdata[i].idx = tdata[0].idx; + } + } + + pool = hts_tpool_init(opts->nthreads); + q = hts_tpool_process_init(pool, opts->nthreads*2, 0); + hts_tpool_result *r; + opts->pool = pool; + + do { + // next chromosome + used_start = -1; + used_end = 0; + + if (opts->reg) { + sam_parse_region(opts->h, opts->reg, &chr, &start, &end, 0); + if (start < 0) + start = 0; + if (end > opts->h->target_len[chr]) + end = opts->h->target_len[chr]; + } else { + for (; chr < sam_hdr_nref(opts->h); chr++) { + hts_itr_t *itr = sam_itr_queryi(tdata[0].idx, chr, 0, + HTS_POS_MAX); + int finished = itr->finished; + if (finished && opts->all_bases > 1) { + // empty chr + kstring_t seq = KS_INITIALIZE; + kstring_t qual = KS_INITIALIZE; + hts_pos_t rlen = sam_hdr_tid2len(opts->h, chr); + ctx c; + memset(&c, 0, sizeof(c)); + c.h = opts->h; + c.opts = opts; + c.tid = chr; + c.start = 0; + c.end = rlen; + c.ks_ins_start = -1; + + if (opts->fmt == PILEUP) { + if (empty_pileup2(&c, opts->h, chr, 0, 0, rlen) < 0) + return -1; + } else { + hts_pos_t used_start = -1, used_end = 0; + append_cons(&c, &seq, &qual, &used_start, &used_end); + dump_fastq(opts, sam_hdr_tid2name(opts->h, chr), + seq.s, rlen, qual.s, rlen); + } + ks_free(&seq); + ks_free(&qual); + } 
+ sam_itr_destroy(itr); + if (!finished) + break; + } + if (chr == sam_hdr_nref(opts->h)) + goto ret_0; + + start = 0; + end = opts->h->target_len[chr]; + } + + hts_pos_t sub_start = start; + hts_pos_t sub_end = start + opts->span -1; + + ctx *c = NULL; + kstring_t seq = KS_INITIALIZE; + kstring_t qual = KS_INITIALIZE; + while (sub_start < end) { + if (!c) { + c = calloc(1, sizeof(*c)); + c->h = opts->h; + c->tid = chr; + c->start = sub_start; + c->end = sub_end = MIN(sub_start + opts->span, end); + c->counter = counter++; + c->opts = opts; // FIXME: we don't need to copy most of this + c->last_tid = -1; + c->last_pos = -1; + c->first_pos = HTS_POS_MAX; + c->ks_ins_start = -1; + c->seq_column = opts->fmt == PILEUP + ? basic_pileup + : basic_fasta; + c->first = c->start == start; + c->last = c->end == end; + } + + int blk = hts_tpool_dispatch2(pool, q, pileup_job, c, 1); + + // Check for results + while ((r = hts_tpool_next_result(q))) { + ctx *c = (ctx *)hts_tpool_result_data(r); + if (opts->fmt == PILEUP) { + kstring_t *ks = &c->ks_pileup; + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + goto err; + ks_free(ks); + } else { + append_cons(c, &seq, &qual, &used_start, &used_end); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + } + hts_tpool_delete_result(r, 1); + received++; + } + + if (blk == -1) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } else { + c = NULL; + sub_start += opts->span; + sub_end += opts->span; + } + } + + while (received < counter) { + while (!(r = hts_tpool_next_result(q))) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } + ctx *c = (ctx *)hts_tpool_result_data(r); + if (opts->fmt == PILEUP) { + kstring_t *ks = &c->ks_pileup; + if (ks->l && fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + goto err; + ks_free(ks); + } else { + append_cons(c, &seq, &qual, &used_start, &used_end); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + } + hts_tpool_delete_result(r, 1); + received++; + 
} + + if (opts->fmt != PILEUP) + dump_fastq(opts, sam_hdr_tid2name(opts->h, chr), + seq.s, used_end, qual.s, used_end); + + ks_free(&seq); + ks_free(&qual); + + } while (!opts->reg && ++chr < sam_hdr_nref(opts->h)); + + ret_0: + err = 0; + err: + + // Discard any inflight jobs. Can this happen? Perhaps on error. + while (received < counter) { + while (!(r = hts_tpool_next_result(q))) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } + ctx *c = (ctx *)hts_tpool_result_data(r); + ks_free(&c->ks_pileup); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + hts_tpool_delete_result(r, 1); + + received++; + } + + for (int i = 1; i <= opts->nthreads; i++) { + if (tdata[i].idx && tdata[i].idx != tdata[0].idx) + hts_idx_destroy(tdata[i].idx); + + err |= sam_close(tdata[i].fp)<0; + fai_destroy(tdata[i].fai); + free(tdata[i].ref); + } + + if (q) + hts_tpool_process_destroy(q); + if (pool) + hts_tpool_destroy(pool); + + return err; +} + +/* + * Non-threaded implementation. + * Returns 0 on success + * -1 on failure + */ +int pileup_loop_serial(consensus_opts *opts) { + int ret = -1; + thread_data_t *tdata = &opts->tdata[0]; + + // Serial mode uses a single job rather than breaking the task down into + // regions, but we still need to create a job context for it. + ctx c = { + .ks_pileup = {0,0}, + .ks_ins_seq = {0,0}, + .ks_ins_qual = {0,0}, + .opts = opts, + .last_tid = -1, + .last_pos = -1, + .ref_tid = -1, + .iter = NULL + }; + + if (opts->reg) { + c.iter = sam_itr_querys(opts->tdata[0].idx, opts->h, opts->reg); + if (!c.iter) { + print_error("consensus", "Failed to parse region \"%s\"", + opts->reg); + goto err; + } + } + + if (opts->fmt == PILEUP) { + if (pileup_loop(tdata->fp, opts->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + basic_pileup, + opts->mode != MODE_SIMPLE ? nm_free : NULL, + &c) < 0) + goto err; + + if (opts->all_bases) { + int tid = c.iter ? 
c.iter->tid : c.last_tid; + int len = sam_hdr_tid2len(opts->h, tid); + int pos = c.last_pos; + if (c.iter) { + len = MIN(c.iter->end, len); + pos = MAX(c.iter->beg, pos); + } + if (empty_pileup2(&c, opts->h, tid, 0, pos, len) < 0) + goto err; + } + while (!c.iter && opts->all_bases > 1 && + ++c.last_tid < opts->h->n_targets) { + int len = sam_hdr_tid2len(opts->h, c.last_tid); + if (empty_pileup2(&c, opts->h, c.last_tid, 0, 0, len) < 0) + goto err; + } + } else { + if (pileup_loop(tdata->fp, opts->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + basic_fasta, + opts->mode != MODE_SIMPLE ? nm_free : NULL, + &c) < 0) + goto err; + + next_ref_q: + if (opts->all_bases) { + // fill out terminator + int tid = c.iter ? c.iter->tid : c.last_tid; + int len = sam_hdr_tid2len(opts->h, tid); + int pos = c.last_pos; + if (c.iter) { + len = MIN(c.iter->end, len); + pos = MAX(c.iter->beg, pos); + c.last_tid = c.iter->tid; + } + if (pos < len) { + if (update_ref(&c, c.last_tid) < 0) + goto err; + if (ks_expand(&c.ks_ins_seq, len-pos+1) < 0) + goto err; + if (ks_expand(&c.ks_ins_qual, len-pos+1) < 0) + goto err; + while (pos++ < len) { + c.ks_ins_seq.s [c.ks_ins_seq.l++] = + c.ref ? c.ref[pos-1] : 'N'; + c.ks_ins_qual.s[c.ks_ins_qual.l++] = + (c.ref ? 
opts->ref_qual : 0) + '!'; + } + c.ks_ins_seq.s [c.ks_ins_seq.l] = 0; + c.ks_ins_qual.s[c.ks_ins_qual.l] = 0; + } + } + if (c.last_tid >= 0) + dump_fastq(opts, sam_hdr_tid2name(opts->h, c.last_tid), + c.ks_ins_seq.s, c.ks_ins_seq.l, + c.ks_ins_qual.s, c.ks_ins_qual.l); + + if (!c.iter && opts->all_bases > 1 && + ++c.last_tid < opts->h->n_targets) { + c.last_pos = 0; + c.ks_ins_seq.l = c.ks_ins_qual.l = 0; + goto next_ref_q; + } + } + + ret = 0; + err: + + ks_free(&c.ks_pileup); + ks_free(&c.ks_ins_seq); + ks_free(&c.ks_ins_qual); + if (c.iter) + hts_itr_destroy(c.iter); + + return ret; +} + // END OF NEW PILEUP //--------------------------------------------------------------------------- @@ -2326,6 +2932,8 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n"); fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); + fprintf(fp, " -Z, --block-size INT Size of chromosome block (bp) when threading [100000]\n"); + fprintf(fp, " --ref-qual INT QUAL to use for reference bases [0]\n"); fprintf(fp, "\nFor simple consensus mode:\n"); fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n"); fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); @@ -2355,7 +2963,16 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n"); fprintf(fp, "\nGlobal options:\n"); - sam_global_opt_help(fp, "-.---@-."); + // Edited sam_global_opt_help(fp, "-.---@-.") help to expand -@ description. 
+ fprintf(fp, " --input-fmt-option OPT[=VAL]\n"); + fprintf(fp, " Specify a single input file format option in the form\n"); + fprintf(fp, " of OPTION or OPTION=VALUE\n"); + fprintf(fp, " -T, --reference FILE\n"); + fprintf(fp, " Reference sequence FASTA FILE [null]\n"); + fprintf(fp, " -@, --threads INT\n"); + fprintf(fp, " Number of additional decompression threads to use [0]\n"); + fprintf(fp, " --verbosity INT\n"); + fprintf(fp, " Set level of verbosity\n"); exit(exit_status); } @@ -2396,24 +3013,18 @@ int main_consensus(int argc, char **argv) { .het_scale = P_HET_SCALE, .homopoly_fix = 0, .homopoly_redux = 0.01, + .ref_qual = 0, + .span = 500000, // Internal state - .ks_line = {0,0}, - .ks_ins_seq = {0,0}, - .ks_ins_qual = {0,0}, - .fp = NULL, .fp_out = stdout, - .iter = NULL, - .idx = NULL, - .last_tid = -1, - .last_pos = -1, + .ga_in = SAM_GLOBAL_ARGS_INIT }; set_qcal(&opts.qcal, QCAL_FLAT); - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', 'T', '@'), {"use-qual", no_argument, NULL, 'q'}, {"no-use-qual", no_argument, NULL, 'q'+1000}, {"adj-qual", no_argument, NULL, 'q'+100}, @@ -2456,10 +3067,12 @@ int main_consensus(int argc, char **argv) { {"homopoly-redux", required_argument, NULL, 'p'+200}, {"qual-calibration", required_argument, NULL, 't'}, {"config", required_argument, NULL, 'X'}, + {"ref-qual", required_argument, NULL, 20}, + {"block-size", required_argument, NULL, 'Z'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:", + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:T:Z:", lopts, NULL)) >= 0) { switch (c) { case 'a': opts.all_bases++; break; @@ -2497,6 +3110,11 @@ int main_consensus(int argc, char **argv) { case 'm'+101: opts.nm_adjust = 0; break; case 'h'+100: opts.nm_halo = atoi(optarg); break; case 'h'+101: opts.sc_cost = atoi(optarg); break; + case 'Z': + 
opts.span = atoi(optarg); + if (opts.span < 2) + opts.span = 2; + break; case 'm': // mode if (strcasecmp(optarg, "simple") == 0) { @@ -2534,6 +3152,9 @@ int main_consensus(int argc, char **argv) { opts.fmt = FASTQ; } else if (strcasecmp(optarg, "pileup") == 0) { opts.fmt = PILEUP; + // Pileup uses much more memory so reduce default span size + if (opts.span == 500000) + opts.span = 100000; } else { fprintf(stderr, "Unknown format %s\n", optarg); return 1; @@ -2626,11 +3247,21 @@ int main_consensus(int argc, char **argv) { print_error("consensus", "failed to load quality calibration '%s'", optarg); - return -1; + return 1; } break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + case 'T': // --reference + opts.ref_fn = optarg; + break; + + case 20: + opts.ref_qual = atoi(optarg); + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &opts.ga_in) == 0) + break; /* else fall-through */ case '?': usage_exit(stderr, EXIT_FAILURE); @@ -2679,124 +3310,68 @@ int main_consensus(int argc, char **argv) { if (argc == optind) usage_exit(stdout, EXIT_SUCCESS); else usage_exit(stderr, EXIT_FAILURE); } - opts.fp = sam_open_format(argv[optind], "r", &ga.in); - if (opts.fp == NULL) { + + opts.nthreads = opts.ga_in.nthreads; + opts.tdata = calloc(opts.nthreads+1, sizeof(*opts.tdata)); + opts.tdata[0].fp = sam_open_format(argv[optind], "r", + (htsFormat *)&opts.ga_in); + opts.tdata[0].ref_tid = -1; + opts.fn = argv[optind]; + if (opts.tdata[0].fp == NULL) { print_error_errno("consensus", "Cannot open input file \"%s\"", argv[optind]); goto err; } - if (ga.nthreads > 0) - hts_set_threads(opts.fp, ga.nthreads); - if (hts_set_opt(opts.fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); - goto err; + if (opts.ref_fn) { + if (!(opts.tdata[0].fai = fai_load(opts.ref_fn))) { + fprintf(stderr, "Failed to load fai for %s\n", optarg); + return 1; + } } - if (!(opts.h = sam_hdr_read(opts.fp))) { + + if (!(opts.h = 
sam_hdr_read(opts.tdata[0].fp))) { fprintf(stderr, "Failed to read header for \"%s\"\n", argv[optind]); goto err; } if (opts.reg) { - opts.idx = sam_index_load(opts.fp, argv[optind]); - if (!opts.idx) { + opts.tdata[0].idx = sam_index_load(opts.tdata[0].fp, argv[optind]); + if (!opts.tdata[0].idx) { print_error("consensus", "Cannot load index for input file \"%s\"", argv[optind]); goto err; } - opts.iter = sam_itr_querys(opts.idx, opts.h, opts.reg); - if (!opts.iter) { - print_error("consensus", "Failed to parse region \"%s\"", - opts.reg); - goto err; + } else if (opts.nthreads) { + // This is acceptable to fail. It just means threads are decompression + opts.tdata[0].idx = sam_index_load(opts.tdata[0].fp, argv[optind]); + if (!opts.tdata[0].idx) { + fprintf(stderr, "No index: multi-threading is limited to decompression only\n"); + // always consider doing this? + hts_set_threads(opts.tdata[0].fp, opts.nthreads); } } - if (opts.fmt == PILEUP) { - if (pileup_loop(opts.fp, opts.h, readaln2, - opts.mode != MODE_SIMPLE ? nm_init : NULL, - basic_pileup, - opts.mode != MODE_SIMPLE ? nm_free : NULL, - &opts) < 0) + if (opts.nthreads && opts.tdata[0].idx) { + if (pileup_loop_parallel(&opts) < 0) goto err; - - if (opts.all_bases) { - int tid = opts.iter ? opts.iter->tid : opts.last_tid; - int len = sam_hdr_tid2len(opts.h, tid); - int pos = opts.last_pos; - if (opts.iter) { - len = MIN(opts.iter->end, len); - pos = MAX(opts.iter->beg, pos); - } - if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) - goto err; - } - while (!opts.iter && opts.all_bases > 1 && - ++opts.last_tid < opts.h->n_targets) { - int len = sam_hdr_tid2len(opts.h, opts.last_tid); - if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0) - goto err; - } - } else { - if (pileup_loop(opts.fp, opts.h, readaln2, - opts.mode != MODE_SIMPLE ? nm_init : NULL, - basic_fasta, - opts.mode != MODE_SIMPLE ? 
nm_free : NULL, - &opts) < 0) + if (pileup_loop_serial(&opts) < 0) goto err; - - next_ref_q: - if (opts.all_bases) { - // fill out terminator - int tid = opts.iter ? opts.iter->tid : opts.last_tid; - int len = sam_hdr_tid2len(opts.h, tid); - int pos = opts.last_pos; - if (opts.iter) { - len = MIN(opts.iter->end, len); - pos = MAX(opts.iter->beg, pos); - opts.last_tid = opts.iter->tid; - } - if (pos < len) { - if (ks_expand(&opts.ks_ins_seq, len-pos+1) < 0) - goto err; - if (ks_expand(&opts.ks_ins_qual, len-pos+1) < 0) - goto err; - while (pos++ < len) { - opts.ks_ins_seq.s [opts.ks_ins_seq.l++] = 'N'; - opts.ks_ins_qual.s[opts.ks_ins_qual.l++] = '!'; - } - opts.ks_ins_seq.s [opts.ks_ins_seq.l] = 0; - opts.ks_ins_qual.s[opts.ks_ins_qual.l] = 0; - } - } - if (opts.last_tid >= 0) - dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), - opts.ks_ins_seq.s, opts.ks_ins_seq.l, - opts.ks_ins_qual.s, opts.ks_ins_qual.l); - - if (!opts.iter && opts.all_bases > 1 && - ++opts.last_tid < opts.h->n_targets) { - opts.last_pos = 0; - opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0; - goto next_ref_q; - } -// if (consensus_loop(&opts) < 0) { -// print_error_errno("consensus", "Failed"); -// goto err; -// } } ret = 0; err: - if (opts.iter) - hts_itr_destroy(opts.iter); - if (opts.idx) - hts_idx_destroy(opts.idx); + if (opts.tdata[0].fai) + fai_destroy(opts.tdata[0].fai); + free(opts.tdata[0].ref); - if (opts.fp && sam_close(opts.fp) < 0) { + if (opts.tdata[0].idx) + hts_idx_destroy(opts.tdata[0].idx); + + if (opts.tdata[0].fp && sam_close(opts.tdata[0].fp) < 0) { print_error_errno("consensus", "Closing input file \"%s\"", argv[optind]); ret = 1; @@ -2804,19 +3379,17 @@ int main_consensus(int argc, char **argv) { if (opts.h) sam_hdr_destroy(opts.h); - sam_global_args_free(&ga); + sam_global_args_free(&opts.ga_in); if (opts.fp_out && opts.fp_out != stdout) ret |= fclose(opts.fp_out) != 0; else ret |= fflush(stdout) != 0; - ks_free(&opts.ks_line); - ks_free(&opts.ks_ins_seq); - 
ks_free(&opts.ks_ins_qual); - if (ret) print_error("consensus", "failed"); + free(opts.tdata); + return ret; } diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c index 9c73233d..5775a85d 100644 --- a/samtools/bam_consensus.c.pysam.c +++ b/samtools/bam_consensus.c.pysam.c @@ -3,7 +3,7 @@ /* bam_consensus.c -- consensus subcommand. Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) - Copyright (C) 2003-2005,2007-2024 Genome Research Ltd. + Copyright (C) 2003-2005,2007-2025 Genome Research Ltd. Author: James Bonfield @@ -134,9 +134,14 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include +#include +#include +#include #include "samtools.h" #include "sam_opts.h" @@ -190,6 +195,20 @@ typedef struct { int omap[101]; // overcall or INS } qcal_t; +// Persistent data used per thread and reused in each job operating in +// the same thread. This is for things like file handles where we +// can't share the same pointer due to read buffers. We also have one +// of these for the main thread for consistency in non-threaded +// operation. Index 0 is main, 1-N is threads. +typedef struct { + samFile *fp; // BAM files e.g. + hts_idx_t *idx; // BAI index e.g. 
+ faidx_t *fai; // FAI index + int ref_tid; // reference chr value + char *ref; // reference sequence + hts_pos_t ref_len; // ref seq length +} thread_data_t; + typedef struct { // User options char *reg; @@ -226,19 +245,57 @@ typedef struct { double homopoly_fix; double homopoly_redux; qcal_t qcal; + char *ref_fn; + int ref_qual; + int span; // base block size for threads - // Internal state - samFile *fp; + // Internal state, shared between threads + char *fn; FILE *fp_out; + sam_global_args ga_in; sam_hdr_t *h; - hts_idx_t *idx; - hts_itr_t *iter; - kstring_t ks_line; + int nthreads; + hts_tpool *pool; + thread_data_t *tdata; // one per thread + 1 for main +} consensus_opts; + +// Thread specific state (or once only in main if not threaded) +typedef struct { + // Both threaded and non-threaded + consensus_opts *opts; // cached copy; FIXME + kstring_t ks_pileup; kstring_t ks_ins_seq; kstring_t ks_ins_qual; + hts_pos_t ks_ins_start; // first real base, buffer index + hts_pos_t first_pos, last_pos; // genomic coords of first/last base int last_tid; - hts_pos_t last_pos; -} consensus_opts; + char *ref; + hts_pos_t ref_len; + int ref_tid; + hts_itr_t *iter; + + // Threaded only + int counter; // block number + samFile *fp; // thread specific file pointer + sam_hdr_t *h; + pthread_t pid; + int tid; + hts_pos_t start, end; // region to process + int first, last; // region is at start of contig or end of contig + int (*seq_column)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert); +} ctx; + +// Returns the thread_data struct associated with this thread ID +static thread_data_t *thread_data(consensus_opts *opts) { + return &opts->tdata[hts_tpool_worker_id(opts->pool)+1]; +} /* -------------------------------------------------------------------------- * A bayesian consensus algorithm that analyses the data to work out @@ -360,7 +417,7 @@ static cons_probs cons_prob_recall, cons_prob_precise; * * The 
heterozygosity weight though is a per column calculation as we're * trying to model whether the column is pure or mixed. Hence this is done - * once via a prior and has no affect on the individual matrix cells. + * once via a prior and has no effect on the individual matrix cells. * * We have a generic indel probability, but it's a catch all for overcall, * undercall, alignment artifacts, homopolymer issues, etc. So we can set @@ -401,7 +458,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + }, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -411,7 +469,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + }, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -421,7 +480,8 @@ static qcal_t static_qcal[6] = { 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99} + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 + } }, { // HiFi @@ -434,29 +494,29 @@ static qcal_t static_qcal[6] = { 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - }, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44 + }, { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, - 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, - 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, - 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, - 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 26, 26, 25, 26, 26, 27, 27, 
27, - 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, - 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, - 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, - }, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, - 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, - 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, - 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, } }, @@ -498,36 +558,36 @@ static qcal_t static_qcal[6] = { { // ONT R10.4 super { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 12, 13, 14, 15, 15, 16, 17, - 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, - 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, - 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, - 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, - 40, 41, 41, 40, 40, 41, 40, 40, 39, 41, - 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, - 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, - 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, + 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, + 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, + 40, 41, 41, 40, 40, 41, 40, 40, 
39, 41, + 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, }, { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 10, 11, 12, 12, 13, - 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, - 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, - 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, }, { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, - 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, + 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, } }, { // ONT R10.4 duplex; just a copy of hifi for now @@ -541,28 +601,28 @@ static qcal_t static_qcal[6] = { 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, - }, + }, { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, - 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, - 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, - 25, 25, 25, 26, 
26, 26, 27, 27, 28, 28, - 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, - 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, - 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, - 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, - }, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, - 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, - 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, - 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, } }, { // Ultima Genomics @@ -949,7 +1009,7 @@ int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { * Returns 0 (discard) or 1 (keep) on success, -1 on failure. 
*/ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { - consensus_opts *opts = (consensus_opts *)client_data; + consensus_opts *opts = ((ctx *)client_data)->opts; if (!opts->use_mqual) return 1; @@ -1191,14 +1251,14 @@ double lnbinprobhalf(int n, double k) { } #endif +#include "bam_consensus_tab.h" + static int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, pileup_t *plp, consensus_opts *opts, consensus_t *cons, int default_qual, cons_probs *cp) { - int i, j; - static int init_done =0; - static double q2p[101], mqual_pow[256]; + int j; double min_e_exp = DBL_MIN_EXP * log(2) + 1; double S[15] ALIGNED(16) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; @@ -1228,23 +1288,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, 18, 19, 24}; - if (!init_done) { - init_done = 1; - - for (i = 0; i <= 100; i++) { - q2p[i] = pow(10, -i/10.0); - } - - for (i = 0; i < 255; i++) { - //mqual_pow[i] = 1-pow(10, -(i+.01)/10.0); - mqual_pow[i] = 1-pow(10, -(i*.9)/10.0); - //mqual_pow[i] = 1-pow(10, -(i/3+.1)/10.0); - //mqual_pow[i] = 1-pow(10, -(i/2+.05)/10.0); - } - // unknown mqual - mqual_pow[255] = mqual_pow[10]; - } - /* Initialise */ int counts[6] = {0}; #ifdef DO_FRACT @@ -1353,9 +1396,16 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (mqual > opts->high_mqual) mqual = opts->high_mqual; - double _p = 1-q2p[qual]; - double _m = mqual_pow[mqual]; - qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT +// double _p = 1-q2p[qual]; +// double _m = mqual_pow[mqual]; +// qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT + + // Equivalent to the above, but avoiding numbers very close to 1 + // This is also marginally faster. 
+ double _P = q2p[qual]; + double _M = mqual_pow_1m[mqual]; + qual = ph_log(_P+.75*_M-_P*_M); + //qual = ph_log(1-_p*_m); // testing //qual *= 6/sqrt(td); } @@ -1820,58 +1870,6 @@ int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth, return 0; } -/* -------------------------------------------------------------------------- - * Main processing logic - */ - -static void dump_fastq(consensus_opts *opts, - const char *name, - const char *seq, size_t seq_l, - const char *qual, size_t qual_l) { - enum format fmt = opts->fmt; - int line_len = opts->line_len; - FILE *fp = opts->fp_out; - - fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); - size_t i; - for (i = 0; i < seq_l; i += line_len) - fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); - - if (fmt == FASTQ) { - fprintf(fp, "+\n"); - for (i = 0; i < seq_l; i += line_len) - fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); - } -} - -//--------------------------------------------------------------------------- - -/* - * Reads a single alignment record, using either the iterator - * or a direct sam_read1 call. - */ -static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { - consensus_opts *opts = (consensus_opts *)dat; - - for (;;) { - int ret = opts->iter - ? sam_itr_next(fp, opts->iter, b) - : sam_read1(fp, h, b); - if (ret < 0) - return ret; - - // Apply hard filters - if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) - continue; - if (opts->excl_flags && (b->core.flag & opts->excl_flags)) - continue; - if (b->core.qual < opts->min_mqual) - continue; - - return ret; - } -} - /* -------------------------------------------------------------------------- * A simple summing algorithm, either pure base frequency, or by * weighting them according to their quality values. 
@@ -2010,19 +2008,180 @@ static int calculate_consensus_simple(const pileup_t *plp, return het[used_base]; } -static int empty_pileup2(consensus_opts *opts, sam_hdr_t *h, int tid, + +/* -------------------------------------------------------------------------- + * Main processing logic + */ + +/* + * Ensure opts->ref is up to date. + * Returns >=0 on success (length) + * -1 on failure + */ +static hts_pos_t update_ref(ctx *c, int tid) { + consensus_opts *opts = c->opts; + if (!opts->ref_fn) + return 0; + + thread_data_t *tdata = thread_data(opts); + if (tid == tdata->ref_tid && tdata->ref) { + c->ref = tdata->ref; + return tdata->ref_len; + } + + free(tdata->ref); + tdata->ref = NULL; + tdata->ref_tid = tid; + + c->ref = NULL; + + const char *chr = sam_hdr_tid2name(opts->h, tid); + if (!chr) + return -1; + + if (!(tdata->ref = fai_fetch64(tdata->fai, chr, &tdata->ref_len))) + return -1; + c->ref = tdata->ref; + c->ref_tid = tid; + c->ref_len = tdata->ref_len; + + return c->ref_len; +} +// Outputs a FASTA or FASTQ consensus sequence +static void dump_fastq(consensus_opts *opts, + const char *name, + const char *seq, size_t seq_l, + const char *qual, size_t qual_l) { + enum format fmt = opts->fmt; + int line_len = opts->line_len; + FILE *fp = opts->fp_out; + + if (!seq_l) + return; + + fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); + size_t i; + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); + + if (fmt == FASTQ) { + fprintf(fp, "+\n"); + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); + } +} + +//--------------------------------------------------------------------------- + +/* + * Reads a single alignment record, using either the iterator + * or a direct sam_read1 call. This also applies the include/exclude filters. 
+ */ +static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { + ctx *c = (ctx *)dat; + consensus_opts *opts = c->opts; + + for (;;) { + int ret = c->iter + ? sam_itr_next(fp, c->iter, b) + : sam_read1(fp, h, b); + if (ret < 0) + return ret; + + // Apply hard filters + if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) + continue; + if (opts->excl_flags && (b->core.flag & opts->excl_flags)) + continue; + if (b->core.qual < opts->min_mqual) + continue; + + return ret; + } +} + +// Output/append a portion of empty pileup. This may be N/0 or ref/qual. +static int empty_pileup2(ctx *c, sam_hdr_t *h, int tid, int threaded, hts_pos_t start, hts_pos_t end) { + consensus_opts *opts = c->opts; const char *name = sam_hdr_tid2name(h, tid); hts_pos_t i; - int err = 0; - for (i = start; i < end; i++) - err |= fprintf(opts->fp_out, "%s\t%"PRIhts_pos"\t0\t0\tN\t0\t*\t*\n", name, i+1) < 0; + + char *rseq = NULL; + if (opts->ref_fn && (err |= (update_ref(c, tid) <= 0)) == 0) + rseq = c->ref; + + if (threaded) { + kstring_t *ks = &c->ks_pileup; + for (i = start; i < end; i++) + err |= ksprintf(ks, + "%s\t%"PRIhts_pos"\t0\t0\t%c\t0\t*\t*\n", + name, i+1, rseq ? rseq[i] : 'N') < 0; + } else { + for (i = start; i < end; i++) + err |= fprintf(opts->fp_out, + "%s\t%"PRIhts_pos"\t0\t0\t%c\t0\t*\t*\n", + name, i+1, rseq ? rseq[i] : 'N') < 0; + } return err ? -1 : 0; } /* + * Compute consensus for a specific base. Fills out base and qual. + * Returns 0 on success, + * -1 on failure. + */ +int consensus_base(consensus_opts *opts, + pileup_t *p, hts_pos_t pos, int depth, + int *base, int *qual) { + int cb, cq; + + if (opts->mode != MODE_SIMPLE) { + consensus_t cons; + calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); + if (cons.depth < opts->min_depth && cons.call != 4) { + // && cons.call != 4. 
See #2167 + cb = 'N'; + cq = 0; + } else if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else { + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*' && + cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { + // het base/* keeps base or * as most likely pure call, else N. + // We still set quality to zero however as this is more useful + // than simply changing the base to N. + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + *base = cb; + *qual = cq; + + return 0; +} + +/* + * Callback from the pileup algorithm. + * Adds pileup format consensus for a specific column. + * * Returns 0 on success * -1 on failure */ @@ -2031,82 +2190,55 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, unsigned char *qp, *cp; char *rp; int ref, cb, cq; - consensus_opts *opts = (consensus_opts *)cd; + ctx *c = (ctx *)cd; + consensus_opts *opts = c->opts; int tid = p->b.core.tid; -// opts->show_ins=0; -// opts->show_del=1; if (!opts->show_ins && nth) return 0; - if (opts->iter) { - if (opts->iter->beg >= pos || opts->iter->end < pos) + if (c->iter) { + if (c->iter->beg >= pos || c->iter->end < pos) return 0; } if (opts->all_bases) { - if (tid != opts->last_tid && opts->last_tid >= -1) { - if (opts->last_tid >= 0) { + if (tid != c->last_tid && c->last_tid >= -1) { + if (c->last_tid >= 0) { // remainder of previous ref - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (opts->iter) - len = MIN(opts->iter->end, len); - if (empty_pileup2(opts, opts->h, opts->last_tid, - opts->last_pos, len) < 0) + hts_pos_t len = sam_hdr_tid2len(opts->h, c->last_tid); + if (c->iter) + len = MIN(c->iter->end, len); + if (empty_pileup2(c, opts->h, c->last_tid, opts->nthreads, + c->last_pos, len) < 0) return -1; } - opts->last_pos = 
opts->iter ? opts->iter->beg : 0; + c->last_pos = c->iter ? c->iter->beg : 0; } // Any refs between last_tid and tid - if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) { - while (++opts->last_tid < tid) { - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0) + if (!c->iter && tid > c->last_tid && opts->all_bases > 1) { + while (++c->last_tid < tid) { + hts_pos_t len = sam_hdr_tid2len(opts->h, c->last_tid); + if (empty_pileup2(c, opts->h, c->last_tid, 0, 0, len) < 0) return -1; } } // Any gaps in this ref (same tid) or at start of this new tid - if (opts->last_pos >= 0 && pos > opts->last_pos+1) { - if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, - pos-1) < 0) + if (c->last_pos >= 0 && pos > c->last_pos+1) { + if (empty_pileup2(c, opts->h, p->b.core.tid, opts->nthreads, + c->last_pos, pos-1) < 0) return -1; - } else if (opts->last_pos < 0) { - if (empty_pileup2(opts, opts->h, p->b.core.tid, - opts->iter ? opts->iter->beg : 0, pos-1) < 0) + } else if (c->last_pos < 0) { + if (empty_pileup2(c, opts->h, p->b.core.tid, opts->nthreads, + c->iter ? c->iter->beg : 0, pos-1) < 0) return -1; } } - if (opts->mode != MODE_SIMPLE) { - consensus_t cons; - calculate_consensus_gap5m(pos, opts->use_mqual ? 
CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual, - &cons_prob_recall, &cons_prob_precise); - if (cons.depth < opts->min_depth) { - cb = 'N'; - cq = 0; - } else if (cons.het_logodd > 0 && opts->ambig) { - cb = "AMRWa" // 5x5 matrix with ACGT* per row / col - "MCSYc" - "RSGKg" - "WYKTt" - "acgt*"[cons.het_call]; - cq = cons.het_logodd; - } else{ - cb = "ACGT*"[cons.call]; - cq = cons.phred; - } - if (cq < opts->cons_cutoff && cb != '*') { - cb = 'N'; - cq = 0; - } - } else { - cb = calculate_consensus_simple(p, opts, &cq); - } - if (cb < 0) + if (consensus_base(opts, p, pos, depth, &cb, &cq) < 0) return -1; if (!p) @@ -2116,8 +2248,7 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; /* Ref, pos, nth, score, seq, qual */ - kstring_t *ks = &opts->ks_line; - ks->l = 0; + kstring_t *ks = &c->ks_pileup; ref = p->b.core.tid; rp = (char *)sam_hdr_tid2name(h, ref); @@ -2157,140 +2288,161 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } *cp++ = '\t'; *qp++ = '\n'; - if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) - return -1; - opts->last_pos = pos; - opts->last_tid = tid; + if (!opts->nthreads) { + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + return -1; + ks->l = 0; + } + + c->last_pos = pos; + c->last_tid = tid; return 0; } +/* + * Callback from the pileup algorithm. + * Adds fastq/fasta format consensus for a specific column. + * + * We either call this for a single thread with the entire region (c->iter) + * or entire file (no iterator), or it gets called repeatedly from threads + * with sub-regions. When we're dealing with the latter we need to track + * which bases we're filling out with Ns so we can trim if needed. 
+ * + * This updates c->ks_ins_{seq,qual} seq and qual + * c->ks_ins_start index to seq/qual for 1st non-N + * c->first_pos first non-N in genome coords + * c->last_pos last genome position processed + * c->last_tid last genome chr processed + * (Amongst other variables) + * + * Returns 0 on success, + * -1 on failure. + */ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, int depth, hts_pos_t pos, int nth, int is_insert) { int cb, cq; - consensus_opts *opts = (consensus_opts *)cd; + ctx *c = (ctx *)cd; + consensus_opts *opts = c->opts; int tid = p->b.core.tid; - kstring_t *seq = &opts->ks_ins_seq; - kstring_t *qual = &opts->ks_ins_qual; + kstring_t *seq = &c->ks_ins_seq; + kstring_t *qual = &c->ks_ins_qual; if (!opts->show_ins && nth) return 0; - if (opts->iter) { - if (opts->iter->beg >= pos || opts->iter->end < pos) + if (c->iter) { + if (c->iter->beg >= pos || c->iter->end < pos) return 0; } + if (c->first_pos > pos) + c->first_pos = pos; + next_ref: - if (tid != opts->last_tid) { - if (opts->last_tid != -1) { + if (tid != c->last_tid) { + if (c->last_tid != -1) { if (opts->all_bases) { // Fill in remainder of previous reference int i, N; - if (opts->iter) { - opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); - N = opts->iter->end; + if (c->iter) { + c->last_pos = MAX(c->last_pos, c->iter->beg-1); + N = c->iter->end; } else { N = INT_MAX; } - N = MIN(N, sam_hdr_tid2len(opts->h, opts->last_tid)) - - opts->last_pos; + N = MIN(N, sam_hdr_tid2len(opts->h, c->last_tid)) + - c->last_pos; if (N > 0) { if (ks_expand(seq, N+1) < 0) return -1; if (ks_expand(qual, N+1) < 0) return -1; - for (i = 0; i < N; i++) { - seq->s[seq->l++] = 'N'; - qual->s[qual->l++] = '!'; + if (c->ref) { + hts_pos_t rlen; + if ((rlen = update_ref(c, c->last_tid)) < 0) + return -1; + for (i = 0; i < N; i++) { + seq->s[seq->l++] = c->ref[c->last_pos+i]; + qual->s[qual->l++] = opts->ref_qual + '!'; + } + } else { + for (i = 0; i < N; i++) { + seq->s[seq->l++] = 'N'; + 
qual->s[qual->l++] = '!'; + } } seq->s[seq->l] = 0; qual->s[qual->l] = 0; } } - dump_fastq(opts, sam_hdr_tid2name(opts->h, opts->last_tid), + dump_fastq(opts, sam_hdr_tid2name(opts->h, c->last_tid), seq->s, seq->l, qual->s, qual->l); } + if (update_ref(c, tid) < 0) + return -1; seq->l = 0; qual->l = 0; - if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) { - opts->last_pos = 0; + if (!c->iter && opts->all_bases > 1 && ++c->last_tid < tid) { + c->last_pos = 0; goto next_ref; } - opts->last_tid = tid; - if (opts->iter) - opts->last_pos = opts->iter->beg; + c->last_tid = tid; + if (c->iter) + c->last_pos = opts->all_bases ? c->iter->beg : pos-1; else - opts->last_pos = opts->all_bases ? 0 : pos-1; + c->last_pos = opts->all_bases ? 0 : pos-1; } - // share this with basic_pileup - if (opts->mode != MODE_SIMPLE) { - consensus_t cons; - calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual, - &cons_prob_recall, &cons_prob_precise); - if (cons.depth < opts->min_depth) { - cb = 'N'; - cq = 0; - } else if (cons.het_logodd > 0 && opts->ambig) { - cb = "AMRWa" // 5x5 matrix with ACGT* per row / col - "MCSYc" - "RSGKg" - "WYKTt" - "acgt*"[cons.het_call]; - cq = cons.het_logodd; - } else { - cb = "ACGT*"[cons.call]; - cq = cons.phred; - } - if (cq < opts->cons_cutoff && cb != '*' && - cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { - // het base/* keeps base or * as most likely pure call, else N. - // This is because we don't have a traditional way of representing - // base or not-base ambiguity. 
- cb = 'N'; - cq = 0; - } - } else { - cb = calculate_consensus_simple(p, opts, &cq); - } - if (cb < 0) + if (consensus_base(opts, p, pos, depth, &cb, &cq) < 0) return -1; if (!p) return 0; if (!opts->show_del && cb == '*') { - opts->last_pos = pos; - opts->last_tid = tid; + c->last_pos = pos; + c->last_tid = tid; return 0; } + if (opts->mark_ins && nth && cb != '*') { kputc('_', seq); kputc('_', qual); } - // end of share - // Append consensus base/qual to seqs - if (pos > opts->last_pos) { - if (opts->last_pos >= 0 || opts->all_bases) { - // FIXME: don't expand qual if fasta - if (ks_expand(seq, pos - opts->last_pos) < 0 || - ks_expand(qual, pos - opts->last_pos) < 0) + if (pos > c->last_pos) { + if (c->last_pos > 0 || opts->all_bases) { + if (ks_expand(seq, pos - c->last_pos) < 0 || + (opts->fmt == FASTQ && + ks_expand(qual, pos - c->last_pos) < 0)) return -1; - memset(seq->s + seq->l, 'N', pos - (opts->last_pos+1)); - memset(qual->s + qual->l, '!', pos - (opts->last_pos+1)); - seq->l += pos - (opts->last_pos+1); - qual->l += pos - (opts->last_pos+1); + if (update_ref(c, tid) < 0) + return -1; + if (c->ref) { + // last bases of the previous reference + memcpy(seq->s + seq->l, c->ref + c->last_pos, + pos - (c->last_pos+1)); + if (opts->fmt == FASTQ) + memset(qual->s + qual->l, opts->ref_qual + '!', + pos - (c->last_pos+1)); + } else { + memset(seq->s + seq->l, 'N', pos - (c->last_pos+1)); + if (opts->fmt == FASTQ) + memset(qual->s + qual->l, '!', pos - (c->last_pos+1)); + } + seq->l += pos - (c->last_pos+1); + qual->l += pos - (c->last_pos+1); } } if ((nth && opts->show_ins && cb != '*') - || cb != '*' || (pos > opts->last_pos && opts->show_del)) { + || cb != '*' || (pos > c->last_pos && opts->show_del)) { + if (c->ks_ins_start == -1) + c->ks_ins_start = seq->l; int err = 0; err |= kputc(cb, seq) < 0; err |= kputc(MIN(cq, '~'-'!')+'!', qual) < 0; @@ -2298,12 +2450,466 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return -1; } - 
opts->last_pos = pos; - opts->last_tid = tid; + c->last_pos = pos; + c->last_tid = tid; + + return 0; +} + +/* + * Computes pileup or fasta/q consensus for a given region. This is executed + * within a worker thread. + */ +void *pileup_job(void *data) { + // A local copy of consensus_opts, per pileup context + ctx *c = (ctx *)data; + consensus_opts *opts = c->opts; + + thread_data_t *tdata = thread_data(opts); + samFile *fp = tdata->fp; + + // Do the pileup job on our local iterator region + c->iter = sam_itr_queryi(tdata->idx, c->tid, c->start, c->end); + pileup_loop(fp, c->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + c->seq_column, nm_free, c); + + if (opts->fmt == PILEUP && c->last_pos < c->end && opts->all_bases) { + hts_pos_t beg = MAX(c->iter ? c->iter->beg : 0, c->last_pos); + empty_pileup2(c, opts->h, c->tid, 1, beg, c->end); + } + + sam_itr_destroy(c->iter); + + return c; +} + + +// Copy the reference to fastq if known, or fill out Ns. +// Returns 0 on success, -1 on failure +int ref_or_Ns(ctx *c, kstring_t *seq, kstring_t *qual, + hts_pos_t pos, hts_pos_t Nlen) { + consensus_opts *opts = c->opts; + + if (ks_resize(seq, seq->l + Nlen+1) < 0) + return -1; + if (opts->fmt == FASTQ) + if (ks_resize(qual, qual->l + Nlen+1) < 0) + return -1; + + if (opts->ref_fn) { + hts_pos_t rlen; + if ((rlen = update_ref(c, c->tid)) < 0) + return -1; + memcpy(seq->s + seq->l, &c->ref[pos], Nlen); + seq->s[seq->l += Nlen] = 0; + if (opts->fmt == FASTQ) { + memset(qual->s + qual->l, opts->ref_qual + '!', Nlen); + qual->s[qual->l += Nlen] = 0; + } + } else { + memset(seq->s + seq->l, 'N', Nlen); + seq->s[seq->l += Nlen] = 0; + if (opts->fmt == FASTQ) { + memset(qual->s + qual->l, '!', Nlen); + qual->s[qual->l += Nlen] = 0; + } + } return 0; } +// Append a chunk of sequence data in seq/qual to ks. +// If we're in the middle of the requestion range then we need to pad +// any starting and ending locations with Ns. 
Otherwise we can omit them +// iff the -a option wasn't specified. +int append_cons(ctx *c, kstring_t *seq, kstring_t *qual, + hts_pos_t *used_start, hts_pos_t *used_end) { + consensus_opts *opts = c->opts; + + // Our block returned is based on first..last covered within that block. + // With -a we expand that to start..end of region. Similarly if the block + // is internal (not the first or last chunk) we also have to expand. + if (opts->all_bases || *used_start >= 0) { + hts_pos_t Nlen = c->ks_ins_start == -1 + ? c->end - c->start // entire block is missing + : c->first_pos - c->start - 1; // first covered base + if (Nlen) { + if (ks_resize(seq, seq->l + Nlen+1) < 0) + return -1; + if (ref_or_Ns(c, seq, qual, c->start, Nlen) < 0) + return -1; + + if (*used_start == -1) + *used_start = 0; // we now count the Ns. + if (opts->all_bases) + *used_end = seq->l; + } + } + + // Seq may start with Ns and we may wish to trim them if we've not emitted + // any sequence yet. This is dealt with already in *used_start >= 0 above, + // so we don't want to add twice. + char *ks_seq = c->ks_ins_seq.s, *ks_qual = c->ks_ins_qual.s; + int64_t ks_seq_l = c->ks_ins_seq.l, ks_qual_l = c->ks_ins_qual.l; + if (c->ks_ins_start >= 0) { + ks_seq += c->ks_ins_start; + ks_qual += c->ks_ins_start; + ks_seq_l -= c->ks_ins_start; + ks_qual_l -= c->ks_ins_start; + } + + // The real sequence + if (ks_seq_l) { + kputsn(ks_seq, ks_seq_l, seq); + kputsn(ks_qual, ks_qual_l, qual); + + // Any post Ns + int Nlen = c->end - c->last_pos; + *used_end = seq->l; + *used_start = 0; + if (Nlen) { + if (ref_or_Ns(c, seq, qual, c->last_pos, Nlen) < 0) + return -1; + + if (opts->all_bases) { + *used_end = seq->l; + } + } + } + + return 0; +} + +// Parallel consensus generation. +// +// Executes in the main thread but despatches jobs to the worker thread to +// do the bulk of the pileup/fastq creation. 
+int pileup_loop_parallel(consensus_opts *opts) { + int chr = 0, err = -1; + hts_pos_t start; + hts_pos_t end; + hts_tpool *pool = NULL; + hts_tpool_process *q = NULL; + + hts_pos_t used_start, used_end; + + //printf("Reg %s:%ld-%ld\n", opts->h->target_name[chr], start, end); + int counter = 0, received = 0; + + thread_data_t *tdata = opts->tdata; + for (int i = 1; i <= opts->nthreads; i++) { + tdata[i].fp = sam_open_format(opts->fn, "r", + (htsFormat *)&opts->ga_in); + if (!tdata[i].fp) + goto err; + if (opts->ref_fn) { + if (!(tdata[i].fai = fai_load(opts->ref_fn))) + goto err; + } + if (tdata[i].fp->format.format == cram) { + // For CRAM, as indices are tied to a file descriptor + tdata[i].idx = sam_index_load(tdata[i].fp, opts->fn); + } else { + tdata[i].idx = tdata[0].idx; + } + } + + pool = hts_tpool_init(opts->nthreads); + q = hts_tpool_process_init(pool, opts->nthreads*2, 0); + hts_tpool_result *r; + opts->pool = pool; + + do { + // next chromosome + used_start = -1; + used_end = 0; + + if (opts->reg) { + sam_parse_region(opts->h, opts->reg, &chr, &start, &end, 0); + if (start < 0) + start = 0; + if (end > opts->h->target_len[chr]) + end = opts->h->target_len[chr]; + } else { + for (; chr < sam_hdr_nref(opts->h); chr++) { + hts_itr_t *itr = sam_itr_queryi(tdata[0].idx, chr, 0, + HTS_POS_MAX); + int finished = itr->finished; + if (finished && opts->all_bases > 1) { + // empty chr + kstring_t seq = KS_INITIALIZE; + kstring_t qual = KS_INITIALIZE; + hts_pos_t rlen = sam_hdr_tid2len(opts->h, chr); + ctx c; + memset(&c, 0, sizeof(c)); + c.h = opts->h; + c.opts = opts; + c.tid = chr; + c.start = 0; + c.end = rlen; + c.ks_ins_start = -1; + + if (opts->fmt == PILEUP) { + if (empty_pileup2(&c, opts->h, chr, 0, 0, rlen) < 0) + return -1; + } else { + hts_pos_t used_start = -1, used_end = 0; + append_cons(&c, &seq, &qual, &used_start, &used_end); + dump_fastq(opts, sam_hdr_tid2name(opts->h, chr), + seq.s, rlen, qual.s, rlen); + } + ks_free(&seq); + ks_free(&qual); + } 
+ sam_itr_destroy(itr); + if (!finished) + break; + } + if (chr == sam_hdr_nref(opts->h)) + goto ret_0; + + start = 0; + end = opts->h->target_len[chr]; + } + + hts_pos_t sub_start = start; + hts_pos_t sub_end = start + opts->span -1; + + ctx *c = NULL; + kstring_t seq = KS_INITIALIZE; + kstring_t qual = KS_INITIALIZE; + while (sub_start < end) { + if (!c) { + c = calloc(1, sizeof(*c)); + c->h = opts->h; + c->tid = chr; + c->start = sub_start; + c->end = sub_end = MIN(sub_start + opts->span, end); + c->counter = counter++; + c->opts = opts; // FIXME: we don't need to copy most of this + c->last_tid = -1; + c->last_pos = -1; + c->first_pos = HTS_POS_MAX; + c->ks_ins_start = -1; + c->seq_column = opts->fmt == PILEUP + ? basic_pileup + : basic_fasta; + c->first = c->start == start; + c->last = c->end == end; + } + + int blk = hts_tpool_dispatch2(pool, q, pileup_job, c, 1); + + // Check for results + while ((r = hts_tpool_next_result(q))) { + ctx *c = (ctx *)hts_tpool_result_data(r); + if (opts->fmt == PILEUP) { + kstring_t *ks = &c->ks_pileup; + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + goto err; + ks_free(ks); + } else { + append_cons(c, &seq, &qual, &used_start, &used_end); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + } + hts_tpool_delete_result(r, 1); + received++; + } + + if (blk == -1) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } else { + c = NULL; + sub_start += opts->span; + sub_end += opts->span; + } + } + + while (received < counter) { + while (!(r = hts_tpool_next_result(q))) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } + ctx *c = (ctx *)hts_tpool_result_data(r); + if (opts->fmt == PILEUP) { + kstring_t *ks = &c->ks_pileup; + if (ks->l && fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + goto err; + ks_free(ks); + } else { + append_cons(c, &seq, &qual, &used_start, &used_end); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + } + hts_tpool_delete_result(r, 1); + received++; + 
} + + if (opts->fmt != PILEUP) + dump_fastq(opts, sam_hdr_tid2name(opts->h, chr), + seq.s, used_end, qual.s, used_end); + + ks_free(&seq); + ks_free(&qual); + + } while (!opts->reg && ++chr < sam_hdr_nref(opts->h)); + + ret_0: + err = 0; + err: + + // Discard any inflight jobs. Can this happen? Perhaps on error. + while (received < counter) { + while (!(r = hts_tpool_next_result(q))) { + struct timespec req = { 0, 1000000 }; + nanosleep(&req, NULL); + } + ctx *c = (ctx *)hts_tpool_result_data(r); + ks_free(&c->ks_pileup); + ks_free(&c->ks_ins_seq); + ks_free(&c->ks_ins_qual); + hts_tpool_delete_result(r, 1); + + received++; + } + + for (int i = 1; i <= opts->nthreads; i++) { + if (tdata[i].idx && tdata[i].idx != tdata[0].idx) + hts_idx_destroy(tdata[i].idx); + + err |= sam_close(tdata[i].fp)<0; + fai_destroy(tdata[i].fai); + free(tdata[i].ref); + } + + if (q) + hts_tpool_process_destroy(q); + if (pool) + hts_tpool_destroy(pool); + + return err; +} + +/* + * Non-threaded implementation. + * Returns 0 on success + * -1 on failure + */ +int pileup_loop_serial(consensus_opts *opts) { + int ret = -1; + thread_data_t *tdata = &opts->tdata[0]; + + // Serial mode uses a single job rather than breaking the task down into + // regions, but we still need to create a job context for it. + ctx c = { + .ks_pileup = {0,0}, + .ks_ins_seq = {0,0}, + .ks_ins_qual = {0,0}, + .opts = opts, + .last_tid = -1, + .last_pos = -1, + .ref_tid = -1, + .iter = NULL + }; + + if (opts->reg) { + c.iter = sam_itr_querys(opts->tdata[0].idx, opts->h, opts->reg); + if (!c.iter) { + print_error("consensus", "Failed to parse region \"%s\"", + opts->reg); + goto err; + } + } + + if (opts->fmt == PILEUP) { + if (pileup_loop(tdata->fp, opts->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + basic_pileup, + opts->mode != MODE_SIMPLE ? nm_free : NULL, + &c) < 0) + goto err; + + if (opts->all_bases) { + int tid = c.iter ? 
c.iter->tid : c.last_tid; + int len = sam_hdr_tid2len(opts->h, tid); + int pos = c.last_pos; + if (c.iter) { + len = MIN(c.iter->end, len); + pos = MAX(c.iter->beg, pos); + } + if (empty_pileup2(&c, opts->h, tid, 0, pos, len) < 0) + goto err; + } + while (!c.iter && opts->all_bases > 1 && + ++c.last_tid < opts->h->n_targets) { + int len = sam_hdr_tid2len(opts->h, c.last_tid); + if (empty_pileup2(&c, opts->h, c.last_tid, 0, 0, len) < 0) + goto err; + } + } else { + if (pileup_loop(tdata->fp, opts->h, readaln2, + opts->mode != MODE_SIMPLE ? nm_init : NULL, + basic_fasta, + opts->mode != MODE_SIMPLE ? nm_free : NULL, + &c) < 0) + goto err; + + next_ref_q: + if (opts->all_bases) { + // fill out terminator + int tid = c.iter ? c.iter->tid : c.last_tid; + int len = sam_hdr_tid2len(opts->h, tid); + int pos = c.last_pos; + if (c.iter) { + len = MIN(c.iter->end, len); + pos = MAX(c.iter->beg, pos); + c.last_tid = c.iter->tid; + } + if (pos < len) { + if (update_ref(&c, c.last_tid) < 0) + goto err; + if (ks_expand(&c.ks_ins_seq, len-pos+1) < 0) + goto err; + if (ks_expand(&c.ks_ins_qual, len-pos+1) < 0) + goto err; + while (pos++ < len) { + c.ks_ins_seq.s [c.ks_ins_seq.l++] = + c.ref ? c.ref[pos-1] : 'N'; + c.ks_ins_qual.s[c.ks_ins_qual.l++] = + (c.ref ? 
opts->ref_qual : 0) + '!'; + } + c.ks_ins_seq.s [c.ks_ins_seq.l] = 0; + c.ks_ins_qual.s[c.ks_ins_qual.l] = 0; + } + } + if (c.last_tid >= 0) + dump_fastq(opts, sam_hdr_tid2name(opts->h, c.last_tid), + c.ks_ins_seq.s, c.ks_ins_seq.l, + c.ks_ins_qual.s, c.ks_ins_qual.l); + + if (!c.iter && opts->all_bases > 1 && + ++c.last_tid < opts->h->n_targets) { + c.last_pos = 0; + c.ks_ins_seq.l = c.ks_ins_qual.l = 0; + goto next_ref_q; + } + } + + ret = 0; + err: + + ks_free(&c.ks_pileup); + ks_free(&c.ks_ins_seq); + ks_free(&c.ks_ins_qual); + if (c.iter) + hts_itr_destroy(c.iter); + + return ret; +} + // END OF NEW PILEUP //--------------------------------------------------------------------------- @@ -2328,6 +2934,8 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n"); fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); + fprintf(fp, " -Z, --block-size INT Size of chromosome block (bp) when threading [100000]\n"); + fprintf(fp, " --ref-qual INT QUAL to use for reference bases [0]\n"); fprintf(fp, "\nFor simple consensus mode:\n"); fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n"); fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); @@ -2357,7 +2965,16 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n"); fprintf(fp, "\nGlobal options:\n"); - sam_global_opt_help(fp, "-.---@-."); + // Edited sam_global_opt_help(fp, "-.---@-.") help to expand -@ description. 
+ fprintf(fp, " --input-fmt-option OPT[=VAL]\n"); + fprintf(fp, " Specify a single input file format option in the form\n"); + fprintf(fp, " of OPTION or OPTION=VALUE\n"); + fprintf(fp, " -T, --reference FILE\n"); + fprintf(fp, " Reference sequence FASTA FILE [null]\n"); + fprintf(fp, " -@, --threads INT\n"); + fprintf(fp, " Number of additional decompression threads to use [0]\n"); + fprintf(fp, " --verbosity INT\n"); + fprintf(fp, " Set level of verbosity\n"); samtools_exit(exit_status); } @@ -2398,24 +3015,18 @@ int main_consensus(int argc, char **argv) { .het_scale = P_HET_SCALE, .homopoly_fix = 0, .homopoly_redux = 0.01, + .ref_qual = 0, + .span = 500000, // Internal state - .ks_line = {0,0}, - .ks_ins_seq = {0,0}, - .ks_ins_qual = {0,0}, - .fp = NULL, .fp_out = samtools_stdout, - .iter = NULL, - .idx = NULL, - .last_tid = -1, - .last_pos = -1, + .ga_in = SAM_GLOBAL_ARGS_INIT }; set_qcal(&opts.qcal, QCAL_FLAT); - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', 'T', '@'), {"use-qual", no_argument, NULL, 'q'}, {"no-use-qual", no_argument, NULL, 'q'+1000}, {"adj-qual", no_argument, NULL, 'q'+100}, @@ -2458,10 +3069,12 @@ int main_consensus(int argc, char **argv) { {"homopoly-redux", required_argument, NULL, 'p'+200}, {"qual-calibration", required_argument, NULL, 't'}, {"config", required_argument, NULL, 'X'}, + {"ref-qual", required_argument, NULL, 20}, + {"block-size", required_argument, NULL, 'Z'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:", + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:T:Z:", lopts, NULL)) >= 0) { switch (c) { case 'a': opts.all_bases++; break; @@ -2499,6 +3112,11 @@ int main_consensus(int argc, char **argv) { case 'm'+101: opts.nm_adjust = 0; break; case 'h'+100: opts.nm_halo = atoi(optarg); break; case 'h'+101: opts.sc_cost = atoi(optarg); 
break; + case 'Z': + opts.span = atoi(optarg); + if (opts.span < 2) + opts.span = 2; + break; case 'm': // mode if (strcasecmp(optarg, "simple") == 0) { @@ -2536,6 +3154,9 @@ int main_consensus(int argc, char **argv) { opts.fmt = FASTQ; } else if (strcasecmp(optarg, "pileup") == 0) { opts.fmt = PILEUP; + // Pileup uses much more memory so reduce default span size + if (opts.span == 500000) + opts.span = 100000; } else { fprintf(samtools_stderr, "Unknown format %s\n", optarg); return 1; @@ -2628,11 +3249,21 @@ int main_consensus(int argc, char **argv) { print_error("consensus", "failed to load quality calibration '%s'", optarg); - return -1; + return 1; } break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + case 'T': // --reference + opts.ref_fn = optarg; + break; + + case 20: + opts.ref_qual = atoi(optarg); + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &opts.ga_in) == 0) + break; /* else fall-through */ case '?': usage_exit(samtools_stderr, EXIT_FAILURE); @@ -2681,124 +3312,68 @@ int main_consensus(int argc, char **argv) { if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS); else usage_exit(samtools_stderr, EXIT_FAILURE); } - opts.fp = sam_open_format(argv[optind], "r", &ga.in); - if (opts.fp == NULL) { + + opts.nthreads = opts.ga_in.nthreads; + opts.tdata = calloc(opts.nthreads+1, sizeof(*opts.tdata)); + opts.tdata[0].fp = sam_open_format(argv[optind], "r", + (htsFormat *)&opts.ga_in); + opts.tdata[0].ref_tid = -1; + opts.fn = argv[optind]; + if (opts.tdata[0].fp == NULL) { print_error_errno("consensus", "Cannot open input file \"%s\"", argv[optind]); goto err; } - if (ga.nthreads > 0) - hts_set_threads(opts.fp, ga.nthreads); - if (hts_set_opt(opts.fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); - goto err; + if (opts.ref_fn) { + if (!(opts.tdata[0].fai = fai_load(opts.ref_fn))) { + fprintf(samtools_stderr, "Failed to load fai for %s\n", optarg); + 
return 1; + } } - if (!(opts.h = sam_hdr_read(opts.fp))) { + + if (!(opts.h = sam_hdr_read(opts.tdata[0].fp))) { fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); goto err; } if (opts.reg) { - opts.idx = sam_index_load(opts.fp, argv[optind]); - if (!opts.idx) { + opts.tdata[0].idx = sam_index_load(opts.tdata[0].fp, argv[optind]); + if (!opts.tdata[0].idx) { print_error("consensus", "Cannot load index for input file \"%s\"", argv[optind]); goto err; } - opts.iter = sam_itr_querys(opts.idx, opts.h, opts.reg); - if (!opts.iter) { - print_error("consensus", "Failed to parse region \"%s\"", - opts.reg); - goto err; + } else if (opts.nthreads) { + // This is acceptable to fail. It just means threads are decompression + opts.tdata[0].idx = sam_index_load(opts.tdata[0].fp, argv[optind]); + if (!opts.tdata[0].idx) { + fprintf(samtools_stderr, "No index: multi-threading is limited to decompression only\n"); + // always consider doing this? + hts_set_threads(opts.tdata[0].fp, opts.nthreads); } } - if (opts.fmt == PILEUP) { - if (pileup_loop(opts.fp, opts.h, readaln2, - opts.mode != MODE_SIMPLE ? nm_init : NULL, - basic_pileup, - opts.mode != MODE_SIMPLE ? nm_free : NULL, - &opts) < 0) + if (opts.nthreads && opts.tdata[0].idx) { + if (pileup_loop_parallel(&opts) < 0) goto err; - - if (opts.all_bases) { - int tid = opts.iter ? opts.iter->tid : opts.last_tid; - int len = sam_hdr_tid2len(opts.h, tid); - int pos = opts.last_pos; - if (opts.iter) { - len = MIN(opts.iter->end, len); - pos = MAX(opts.iter->beg, pos); - } - if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) - goto err; - } - while (!opts.iter && opts.all_bases > 1 && - ++opts.last_tid < opts.h->n_targets) { - int len = sam_hdr_tid2len(opts.h, opts.last_tid); - if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0) - goto err; - } - } else { - if (pileup_loop(opts.fp, opts.h, readaln2, - opts.mode != MODE_SIMPLE ? nm_init : NULL, - basic_fasta, - opts.mode != MODE_SIMPLE ? 
nm_free : NULL, - &opts) < 0) + if (pileup_loop_serial(&opts) < 0) goto err; - - next_ref_q: - if (opts.all_bases) { - // fill out terminator - int tid = opts.iter ? opts.iter->tid : opts.last_tid; - int len = sam_hdr_tid2len(opts.h, tid); - int pos = opts.last_pos; - if (opts.iter) { - len = MIN(opts.iter->end, len); - pos = MAX(opts.iter->beg, pos); - opts.last_tid = opts.iter->tid; - } - if (pos < len) { - if (ks_expand(&opts.ks_ins_seq, len-pos+1) < 0) - goto err; - if (ks_expand(&opts.ks_ins_qual, len-pos+1) < 0) - goto err; - while (pos++ < len) { - opts.ks_ins_seq.s [opts.ks_ins_seq.l++] = 'N'; - opts.ks_ins_qual.s[opts.ks_ins_qual.l++] = '!'; - } - opts.ks_ins_seq.s [opts.ks_ins_seq.l] = 0; - opts.ks_ins_qual.s[opts.ks_ins_qual.l] = 0; - } - } - if (opts.last_tid >= 0) - dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), - opts.ks_ins_seq.s, opts.ks_ins_seq.l, - opts.ks_ins_qual.s, opts.ks_ins_qual.l); - - if (!opts.iter && opts.all_bases > 1 && - ++opts.last_tid < opts.h->n_targets) { - opts.last_pos = 0; - opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0; - goto next_ref_q; - } -// if (consensus_loop(&opts) < 0) { -// print_error_errno("consensus", "Failed"); -// goto err; -// } } ret = 0; err: - if (opts.iter) - hts_itr_destroy(opts.iter); - if (opts.idx) - hts_idx_destroy(opts.idx); + if (opts.tdata[0].fai) + fai_destroy(opts.tdata[0].fai); + free(opts.tdata[0].ref); - if (opts.fp && sam_close(opts.fp) < 0) { + if (opts.tdata[0].idx) + hts_idx_destroy(opts.tdata[0].idx); + + if (opts.tdata[0].fp && sam_close(opts.tdata[0].fp) < 0) { print_error_errno("consensus", "Closing input file \"%s\"", argv[optind]); ret = 1; @@ -2806,19 +3381,17 @@ int main_consensus(int argc, char **argv) { if (opts.h) sam_hdr_destroy(opts.h); - sam_global_args_free(&ga); + sam_global_args_free(&opts.ga_in); if (opts.fp_out && opts.fp_out != samtools_stdout) ret |= fclose(opts.fp_out) != 0; else ret |= fflush(samtools_stdout) != 0; - ks_free(&opts.ks_line); - 
ks_free(&opts.ks_ins_seq); - ks_free(&opts.ks_ins_qual); - if (ret) print_error("consensus", "failed"); + free(opts.tdata); + return ret; } diff --git a/samtools/bam_consensus_tab.h b/samtools/bam_consensus_tab.h new file mode 100644 index 00000000..1e6229dc --- /dev/null +++ b/samtools/bam_consensus_tab.h @@ -0,0 +1,404 @@ +/* bam_consensus_tab.h -- constant lookup tables for the consensus algorithm + + Copyright (C) 2024 Genome Research Ltd. + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + + + +/* + * Tables created with this code: + * + * for (int i = 0; i <= 100; i++) + * q2p[i] = pow(10, -i/10.0); + * + * for (int i = 0; i < 255; i++) + * mqual_pow_1m[i] = pow(10, -(i*.9)/10.0); + * + * // unknown mqual + * mqual_pow_1m[255] = mqual_pow_1m[10]; + * + * We use static tables to avoid initialising from within a thread and having + * to use pthread_once calls or locks. 
+ */ + +static double q2p[101] = { + 1, + 0.7943282347242814900312, + 0.6309573444801932495807, + 0.5011872336272722439077, + 0.3981071705534972027252, + 0.3162277660168379411765, + 0.251188643150958013095, + 0.1995262314968879735488, + 0.1584893192461113431424, + 0.1258925411794167281698, + 0.1000000000000000055511, + 0.07943282347242813790089, + 0.06309573444801933050918, + 0.05011872336272722022743, + 0.0398107170553497341503, + 0.03162277660168379134209, + 0.0251188643150957943706, + 0.01995262314968879874266, + 0.01584893192461113431424, + 0.01258925411794167489865, + 0.01000000000000000020817, + 0.007943282347242813790089, + 0.006309573444801930275361, + 0.005011872336272724624828, + 0.00398107170553497341503, + 0.003162277660168379394418, + 0.00251188643150957943706, + 0.001995262314968878920168, + 0.001584893192461114081945, + 0.001258925411794167489865, + 0.001000000000000000020817, + 0.0007943282347242813139568, + 0.000630957344480192984168, + 0.000501187233627272527535, + 0.0003981071705534973523451, + 0.0003162277660168379394418, + 0.000251188643150957954548, + 0.0001995262314968878757538, + 0.0001584893192461114190366, + 0.0001258925411794167435655, + 0.0001000000000000000047922, + 7.943282347242822084236e-05, + 6.309573444801929299579e-05, + 5.011872336272725139824e-05, + 3.981071705534969457693e-05, + 3.162277660168379529943e-05, + 2.511886431509582188223e-05, + 1.995262314968878622013e-05, + 1.584893192461114122603e-05, + 1.258925411794166114284e-05, + 1.000000000000000081803e-05, + 7.943282347242821745423e-06, + 6.309573444801929638392e-06, + 5.011872336272724970418e-06, + 3.981071705534969118879e-06, + 3.16227766016837919113e-06, + 2.511886431509582272927e-06, + 1.995262314968878706716e-06, + 1.584893192461114080251e-06, + 1.258925411794166114284e-06, + 9.999999999999999547481e-07, + 7.943282347242821957181e-07, + 6.309573444801929638392e-07, + 5.011872336272724970418e-07, + 3.981071705534969224758e-07, + 3.16227766016837919113e-07, + 
2.511886431509582272927e-07, + 1.995262314968878706716e-07, + 1.584893192461114080251e-07, + 1.258925411794166167223e-07, + 9.999999999999999547481e-08, + 7.943282347242821957181e-08, + 6.309573444801929638392e-08, + 5.011872336272724970418e-08, + 3.981071705534968960061e-08, + 3.16227766016837919113e-08, + 2.511886431509582074403e-08, + 1.995262314968878640542e-08, + 1.584893192461114278775e-08, + 1.258925411794166101049e-08, + 1.000000000000000020923e-08, + 7.943282347242821957181e-09, + 6.309573444801942873282e-09, + 5.011872336272714713378e-09, + 3.981071705534968960061e-09, + 3.162277660168379522002e-09, + 2.511886431509581991685e-09, + 1.995262314968882776445e-09, + 1.584893192461110928693e-09, + 1.258925411794166266485e-09, + 1.000000000000000062282e-09, + 7.943282347242821750386e-10, + 6.309573444801942459691e-10, + 5.011872336272714299788e-10, + 3.981071705534969373651e-10, + 3.162277660168379522002e-10, + 2.511886431509581991685e-10, + 1.995262314968882828143e-10, + 1.584893192461110876994e-10, + 1.258925411794166163087e-10, + 1.000000000000000036432e-10, +}; + +// 1-mqual_pow() in old code, for improved numerical stability +static double mqual_pow_1m[256] = { + 1, + 0.8128305161640992926309, + 0.6606934480075959958967, + 0.5370317963702526675718, + 0.4365158322401659907186, + 0.3548133892335754713265, + 0.2884031503126605611165, + 0.2344228815319921999105, + 0.1905460717963247396778, + 0.1548816618912481624104, + 0.1258925411794167281698, + 0.1023292992280754182266, + 0.08317637711026708291495, + 0.06760829753919815610086, + 0.05495408738576245538665, + 0.04466835921509629964143, + 0.03630780547701013827311, + 0.02951209226666385437854, + 0.02398832919019491111134, + 0.01949844599758044499938, + 0.01584893192461113431424, + 0.01288249551693133529429, + 0.01047128548050899575494, + 0.008511380382023767468547, + 0.00691830970918936270786, + 0.005623413251903490971129, + 0.004570881896148746952047, + 0.003715352290971724036339, + 0.003019951720402016138306, 
+ 0.002454708915685028559894, + 0.001995262314968878920168, + 0.001621810097358929734015, + 0.001318256738556407509436, + 0.00107151930523760705212, + 0.0008709635899560805045311, + 0.0007079457843841380172353, + 0.0005754399373371571826952, + 0.0004677351412871976464071, + 0.0003801893963205608571385, + 0.0003090295432513588810494, + 0.000251188643150957954548, + 0.0002041737944669529570153, + 0.0001659586907437559614043, + 0.0001348962882591653344218, + 0.0001096478196143185149941, + 8.912509381337458780163e-05, + 7.244359600749906166837e-05, + 5.888436553555884004144e-05, + 4.786300923226380360119e-05, + 3.890451449942804569787e-05, + 3.162277660168379529943e-05, + 2.570395782768864487763e-05, + 2.08929613085403650377e-05, + 1.698243652461742535286e-05, + 1.380384264602883870635e-05, + 1.122018454301963009219e-05, + 9.12010839355909637069e-06, + 7.4131024130091619111e-06, + 6.025595860743568906272e-06, + 4.897788193684456601576e-06, + 3.981071705534969118879e-06, + 3.235936569296280900269e-06, + 2.630267991895381665999e-06, + 2.137962089502232643917e-06, + 1.737800828749376344341e-06, + 1.412537544622755410526e-06, + 1.148153621496884006233e-06, + 9.332543007969905085716e-07, + 7.585775750291835657364e-07, + 6.165950018614821914709e-07, + 5.011872336272724970418e-07, + 4.073802778041130314065e-07, + 3.311311214825914277902e-07, + 2.691534803926913785907e-07, + 2.187761623949547394535e-07, + 1.778279410038922689111e-07, + 1.445439770745924971116e-07, + 1.174897554939530334775e-07, + 9.549925860214348974899e-08, + 7.762471166286895849136e-08, + 6.309573444801929638392e-08, + 5.128613839913637503499e-08, + 4.16869383470335490351e-08, + 3.388441561392020662586e-08, + 2.754228703338163286523e-08, + 2.238721138568337809819e-08, + 1.819700858609982555977e-08, + 1.479108388168207227345e-08, + 1.202264434617413033159e-08, + 9.772372209558071266503e-09, + 7.943282347242821957181e-09, + 6.456542290346535928048e-09, + 5.24807460249773376564e-09, + 
4.265795188015916487035e-09, + 3.467368504525309635778e-09, + 2.818382931264449340833e-09, + 2.290867652767769951333e-09, + 1.862087136662865556652e-09, + 1.513561248436207274854e-09, + 1.230268770812381144727e-09, + 1.000000000000000062282e-09, + 8.128305161640995407296e-10, + 6.606934480075964084586e-10, + 5.370317963702532108845e-10, + 4.365158322401647369267e-10, + 3.548133892335760548437e-10, + 2.884031503126599798765e-10, + 2.344228815319927378371e-10, + 1.905460717963244402743e-10, + 1.548816618912479562773e-10, + 1.258925411794166163087e-10, + 1.02329299228075369211e-10, + 8.317637711026709156723e-11, + 6.760829753919819170079e-11, + 5.495408738576225939084e-11, + 4.466835921509634552494e-11, + 3.630780547701002543629e-11, + 2.951209226666390341713e-11, + 2.398832919019485080354e-11, + 1.949844599758041441061e-11, + 1.584893192461110747747e-11, + 1.288249551693132197657e-11, + 1.047128548050898584033e-11, + 8.51138038202375965546e-12, + 6.91830970918936233983e-12, + 5.623413251903490440623e-12, + 4.57088189614875156966e-12, + 3.715352290971727542496e-12, + 3.019951720402019283787e-12, + 2.454708915685023331606e-12, + 1.995262314968882747364e-12, + 1.621810097358926597075e-12, + 1.318256738556410024608e-12, + 1.071519305237604909644e-12, + 8.709635899560796872095e-13, + 7.079457843841373745556e-13, + 5.754399373371566766933e-13, + 4.677351412871980722804e-13, + 3.801893963205612663477e-13, + 3.090295432513579532869e-13, + 2.51188643150958201188e-13, + 2.041737944669523298356e-13, + 1.659586907437563152957e-13, + 1.348962882591650639904e-13, + 1.096478196143187403169e-13, + 8.912509381337441182976e-14, + 7.244359600749891290308e-14, + 5.888436553555884286751e-14, + 4.786300923226360800767e-14, + 3.890451449942804581978e-14, + 3.16227766016837956946e-14, + 2.570395782768864629488e-14, + 2.089296130854032171409e-14, + 1.698243652461738958356e-14, + 1.380384264602886572573e-14, + 1.122018454301965225833e-14, + 9.120108393559077808785e-15, + 
7.41310241300916204595e-15, + 6.025595860743543893752e-15, + 4.89778819368447641006e-15, + 3.981071705534969483965e-15, + 3.235936569296280957472e-15, + 2.630267991895370861998e-15, + 2.137962089502223792495e-15, + 1.737800828749376410924e-15, + 1.412537544622755396163e-15, + 1.148153621496879413097e-15, + 9.332543007969886808596e-16, + 7.585775750291820704083e-16, + 6.16595001861483473253e-16, + 5.011872336272714560554e-16, + 4.073802778041122139769e-16, + 3.311311214825907705381e-16, + 2.691534803926902844989e-16, + 2.187761623949551861095e-16, + 1.778279410038922824675e-16, + 1.445439770745928054799e-16, + 1.174897554939525594999e-16, + 9.549925860214290967941e-17, + 7.762471166286926954938e-17, + 6.309573444801942677654e-17, + 5.128613839913658731403e-17, + 4.168693834703329160522e-17, + 3.388441561392006804112e-17, + 2.754228703338174659151e-17, + 2.238721138568347026224e-17, + 1.819700858609975128112e-17, + 1.479108388168200988561e-17, + 1.202264434617408187405e-17, + 9.772372209558151065569e-18, + 7.943282347242789549202e-18, + 6.456542290346536037433e-18, + 5.248074602497711949195e-18, + 4.265795188015916572965e-18, + 3.467368504525309543749e-18, + 2.818382931264449082965e-18, + 2.290867652767769905885e-18, + 1.862087136662865733356e-18, + 1.513561248436207116967e-18, + 1.23026877081238105721e-18, + 1.000000000000000071542e-18, + 8.128305161640995455277e-19, + 6.606934480075964466392e-19, + 5.370317963702488787674e-19, + 4.365158322401665295207e-19, + 3.548133892335760584637e-19, + 2.884031503126611635381e-19, + 2.34422881531990820968e-19, + 1.905460717963236485458e-19, + 1.548816618912485883656e-19, + 1.258925411794171323845e-19, + 1.023292992280749404264e-19, + 8.317637711026674950186e-20, + 6.760829753919791474259e-20, + 5.495408738576271148274e-20, + 4.466835921509616402226e-20, + 3.630780547701002879554e-20, + 2.951209226666377752396e-20, + 2.398832919019485115055e-20, + 1.949844599758041376456e-20, + 1.584893192461110764057e-20, + 
1.288249551693132282107e-20, + 1.047128548050898507945e-20, + 8.511380382023758413114e-21, + 6.918309709189362127144e-21, + 5.623413251903490551057e-21, + 4.570881896148751551493e-21, + 3.715352290971727550311e-21, + 3.019951720401994286355e-21, + 2.454708915685033408223e-21, + 1.995262314968882699746e-21, + 1.621810097358933154088e-21, + 1.318256738556399319099e-21, + 1.071519305237600521042e-21, + 8.709635899560831951348e-22, + 7.079457843841402028864e-22, + 5.754399373371542511646e-22, + 4.677351412871961918683e-22, + 3.801893963205596893242e-22, + 3.090295432513604871118e-22, + 2.511886431509571816516e-22, + 2.041737944669523411158e-22, + 1.659586907437556324857e-22, + 1.348962882591650593016e-22, + 1.096478196143182849571e-22, + 8.912509381337440520642e-23, + 7.244359600749890755459e-23, + 5.888436553555883904271e-23, + 4.786300923226380123649e-23, + 3.890451449942804910175e-23, + 3.162277660168379311786e-23, + 2.570395782768864541526e-23, + 2.089296130854040880659e-23, + 1.698243652461732031437e-23, + 1.380384264602886699006e-23, + 0.1258925411794167281698, +}; diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c index fadcccb7..a7f791af 100644 --- a/samtools/bam_fastq.c +++ b/samtools/bam_fastq.c @@ -563,6 +563,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) else rf |= SAM_AUX; } + if (opts->illumina_tag || opts->copy_tags) + rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); @@ -656,6 +658,15 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } + kstring_t str = KS_INITIALIZE; + if (sam_hdr_find_tag_hd(state->h, "SO", &str) == 0 && + strcmp(str.s, "coordinate") == 0) { + print_error(opts->filetype == FASTA ? "fasta" : "fastq", + "Coordinate sorted file. 
" + "Read pairs may be out of order"); + } + ks_free(&str); + *state_out = state; return true; } diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c index 627f741a..e4c17719 100644 --- a/samtools/bam_fastq.c.pysam.c +++ b/samtools/bam_fastq.c.pysam.c @@ -565,6 +565,8 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) else rf |= SAM_AUX; } + if (opts->illumina_tag || opts->copy_tags) + rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); @@ -658,6 +660,15 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } + kstring_t str = KS_INITIALIZE; + if (sam_hdr_find_tag_hd(state->h, "SO", &str) == 0 && + strcmp(str.s, "coordinate") == 0) { + print_error(opts->filetype == FASTA ? "fasta" : "fastq", + "Coordinate sorted file. " + "Read pairs may be out of order"); + } + ks_free(&str); + *state_out = state; return true; } diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index 17fedd58..ee286191 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -1,7 +1,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2023 Genome Research Ltd. + Copyright (C) 2017-2024 Genome Research Ltd. Author: Andrew Whitwham @@ -1045,6 +1045,19 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash } +/* This is based on the fixmate actions for when only one of a pair is present */ +static inline int has_mate(bam1_t *b) { + int ret = 0; + + if ((b->core.flag & BAM_FPAIRED) && !(b->core.flag & BAM_FMUNMAP) && + !((b->core.mtid == -1) && (b->core.mpos == -1))) { + ret = 1; + } + + return ret; +} + + /* Check all duplicates of the highest quality read (the "original") for consistancy. 
Also pre-calculate any values for use in check_duplicate_chain later. Returns 0 on success, >0 on coordinate reading error (program can continue) or @@ -1118,7 +1131,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * uint8_t *data; char *dup_type; int is_opt = 0; - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + int current_paired = has_mate(current->b); if ((data = bam_aux_get(current->b, "dt"))) { if ((dup_type = bam_aux2Z(data))) { @@ -1202,7 +1215,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has while (curr < end_name_match) { size_t count = curr; check_t *current = &list->c[curr]; - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + int current_paired = has_mate(current->b); while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) { // while close enough along the x coordinate @@ -1224,7 +1237,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has // optical duplicates int chk_dup = 0; - int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + int chk_paired = has_mate(chk->b); if (current_paired != chk_paired) { if (!chk_paired) { @@ -1688,7 +1701,7 @@ static int bam_mark_duplicates(md_param_t *param) { // look at the pairs first - if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { + if (has_mate(in_read->b)) { int ret, mate_tmp; key_data_t pair_key; key_data_t single_key; @@ -1716,7 +1729,7 @@ static int bam_mark_duplicates(md_param_t *param) { // look at singles only for duplication marking bp = &kh_val(single_hash, k); - if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { + if (!has_mate(bp->p->b)) { // singleton will always be marked duplicate even if // scores more than one read of the pair bam1_t *dup = bp->p->b; @@ 
-1861,7 +1874,7 @@ static int bam_mark_duplicates(md_param_t *param) { } else if (ret == 0) { // exists bp = &kh_val(single_hash, k); - if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { + if (has_mate(bp->p->b)) { // if matched against one of a pair just mark as duplicate if (param->check_chain) { diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index cdf33c1a..b08ea3e3 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -3,7 +3,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2023 Genome Research Ltd. + Copyright (C) 2017-2024 Genome Research Ltd. Author: Andrew Whitwham @@ -1047,6 +1047,19 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash } +/* This is based on the fixmate actions for when only one of a pair is present */ +static inline int has_mate(bam1_t *b) { + int ret = 0; + + if ((b->core.flag & BAM_FPAIRED) && !(b->core.flag & BAM_FMUNMAP) && + !((b->core.mtid == -1) && (b->core.mpos == -1))) { + ret = 1; + } + + return ret; +} + + /* Check all duplicates of the highest quality read (the "original") for consistancy. Also pre-calculate any values for use in check_duplicate_chain later. 
Returns 0 on success, >0 on coordinate reading error (program can continue) or @@ -1120,7 +1133,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * uint8_t *data; char *dup_type; int is_opt = 0; - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + int current_paired = has_mate(current->b); if ((data = bam_aux_get(current->b, "dt"))) { if ((dup_type = bam_aux2Z(data))) { @@ -1204,7 +1217,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has while (curr < end_name_match) { size_t count = curr; check_t *current = &list->c[curr]; - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + int current_paired = has_mate(current->b); while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) { // while close enough along the x coordinate @@ -1226,7 +1239,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has // optical duplicates int chk_dup = 0; - int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + int chk_paired = has_mate(chk->b); if (current_paired != chk_paired) { if (!chk_paired) { @@ -1690,7 +1703,7 @@ static int bam_mark_duplicates(md_param_t *param) { // look at the pairs first - if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { + if (has_mate(in_read->b)) { int ret, mate_tmp; key_data_t pair_key; key_data_t single_key; @@ -1718,7 +1731,7 @@ static int bam_mark_duplicates(md_param_t *param) { // look at singles only for duplication marking bp = &kh_val(single_hash, k); - if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { + if (!has_mate(bp->p->b)) { // singleton will always be marked duplicate even if // scores more than one read of the pair bam1_t *dup = bp->p->b; @@ -1863,7 +1876,7 @@ static int bam_mark_duplicates(md_param_t *param) 
{ } else if (ret == 0) { // exists bp = &kh_val(single_hash, k); - if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { + if (has_mate(bp->p->b)) { // if matched against one of a pair just mark as duplicate if (param->check_chain) { diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 8fb02cf6..cf4e46d1 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -393,6 +393,10 @@ int bam_sanitize_options(const char *str) { opt |= FIX_MQUAL; else if (strncmp(str_start, "unmap", 5) == 0) opt |= FIX_UNMAP; + else if (strncmp(str_start, "cigdup", 6) == 0) + opt |= FIX_CIGDUP; + else if (strncmp(str_start, "cigarx", 6) == 0) + opt |= FIX_CIGARX | FIX_CIGDUP; else if (strncmp(str_start, "cigar", 5) == 0) opt |= FIX_CIGAR; else if (strncmp(str_start, "aux", 3) == 0) @@ -470,6 +474,53 @@ int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) { } } + if ((flags & FIX_CIGARX) && !(b->core.flag & BAM_FUNMAP)) { + // Turn CIGAR = and X into M. These will then be merged together + // by CIGDUP below. Ie 10=1X9= becomes 20M. + int i; + uint32_t *cig = bam_get_cigar(b); + for (i = 0; i < b->core.n_cigar; i++) { + int op = bam_cigar_op(cig[i]); + if (op == BAM_CEQUAL || op == BAM_CDIFF) + cig[i] = bam_cigar_gen(bam_cigar_oplen(cig[i]), BAM_CMATCH); + } + } + + if ((flags & FIX_CIGDUP) && !(b->core.flag & BAM_FUNMAP)) { + // Canonicalise cigar strings, so xMyM becomes (x+y)M. 
+ int nc = b->core.n_cigar, i, j; + uint32_t *cigf = bam_get_cigar(b), *cigt = cigf; + uint32_t last_op = -1, last_len = 0; + + for (i = j = 0; i < nc; i++) { + int op = bam_cigar_op(cigf[i]); + int len = bam_cigar_oplen(cigf[i]); + if (op == last_op) { + if (last_len + len >= (1<<28)) { + cigt[j-1] = bam_cigar_gen((1<<28)-1, op); + len -= (1<<28)-1; + cigt[j++] = bam_cigar_gen(len, op); + } else { + cigt[j-1] = bam_cigar_gen(len = last_len + len, op); + } + } else if (len > 0) { + cigt[j++] = cigf[i]; + } + if (len > 0) { + last_op = op; + last_len = len; + } + } + + if (j != nc) { + // Collapsed CIGAR so move data down + b->core.n_cigar = j; + uint8_t *endp = b->data + b->l_data; + memmove(cigf + j, cigf + nc, endp - (uint8_t *)(cigf + nc)); + b->l_data -= 4*(nc-j); + } + } + return 0; } diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index a0ba8205..6740b69b 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -395,6 +395,10 @@ int bam_sanitize_options(const char *str) { opt |= FIX_MQUAL; else if (strncmp(str_start, "unmap", 5) == 0) opt |= FIX_UNMAP; + else if (strncmp(str_start, "cigdup", 6) == 0) + opt |= FIX_CIGDUP; + else if (strncmp(str_start, "cigarx", 6) == 0) + opt |= FIX_CIGARX | FIX_CIGDUP; else if (strncmp(str_start, "cigar", 5) == 0) opt |= FIX_CIGAR; else if (strncmp(str_start, "aux", 3) == 0) @@ -472,6 +476,53 @@ int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) { } } + if ((flags & FIX_CIGARX) && !(b->core.flag & BAM_FUNMAP)) { + // Turn CIGAR = and X into M. These will then be merged together + // by CIGDUP below. Ie 10=1X9= becomes 20M. + int i; + uint32_t *cig = bam_get_cigar(b); + for (i = 0; i < b->core.n_cigar; i++) { + int op = bam_cigar_op(cig[i]); + if (op == BAM_CEQUAL || op == BAM_CDIFF) + cig[i] = bam_cigar_gen(bam_cigar_oplen(cig[i]), BAM_CMATCH); + } + } + + if ((flags & FIX_CIGDUP) && !(b->core.flag & BAM_FUNMAP)) { + // Canonicalise cigar strings, so xMyM becomes (x+y)M. 
+ int nc = b->core.n_cigar, i, j; + uint32_t *cigf = bam_get_cigar(b), *cigt = cigf; + uint32_t last_op = -1, last_len = 0; + + for (i = j = 0; i < nc; i++) { + int op = bam_cigar_op(cigf[i]); + int len = bam_cigar_oplen(cigf[i]); + if (op == last_op) { + if (last_len + len >= (1<<28)) { + cigt[j-1] = bam_cigar_gen((1<<28)-1, op); + len -= (1<<28)-1; + cigt[j++] = bam_cigar_gen(len, op); + } else { + cigt[j-1] = bam_cigar_gen(len = last_len + len, op); + } + } else if (len > 0) { + cigt[j++] = cigf[i]; + } + if (len > 0) { + last_op = op; + last_len = len; + } + } + + if (j != nc) { + // Collapsed CIGAR so move data down + b->core.n_cigar = j; + uint8_t *endp = b->data + b->l_data; + memmove(cigf + j, cigf + nc, endp - (uint8_t *)(cigf + nc)); + b->l_data -= 4*(nc-j); + } + } + return 0; } diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c index f84c8053..a1e3ae1a 100644 --- a/samtools/bam_reheader.c +++ b/samtools/bam_reheader.c @@ -1,7 +1,7 @@ /* bam_reheader.c -- reheader subcommand. Copyright (C) 2010 Broad Institute. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2019, 2025 Genome Research Ltd. Author: Heng Li @@ -30,6 +30,9 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#ifdef _WIN32 +#include // GetTempPath +#endif #include "htslib/bgzf.h" #include "htslib/sam.h" @@ -458,7 +461,21 @@ static sam_hdr_t* external_reheader(samFile* in, const char* external) { fprintf(stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); return NULL; } - char tmp_fn[] = "reheaderXXXXXX"; + char tmp_fn[1024+15]; + char *env = getenv("TMPDIR"); +#ifdef _WIN32 + char tmp_path[MAX_PATH]; + if (!env) { + int ret = GetTempPath(MAX_PATH, tmp_path); + if (!ret || ret > MAX_PATH) + strcpy(tmp_path, "./"); + env = tmp_path; + } +#else + if (!env) + env = "/tmp"; +#endif + snprintf(tmp_fn, sizeof(tmp_fn), "%s/reheaderXXXXXX", env); int tmp_fd = mkstemp(tmp_fn); if (tmp_fd < 0) { print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 5a78c661..4421aca9 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -3,7 +3,7 @@ /* bam_reheader.c -- reheader subcommand. Copyright (C) 2010 Broad Institute. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2019, 2025 Genome Research Ltd. Author: Heng Li @@ -32,6 +32,9 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#ifdef _WIN32 +#include // GetTempPath +#endif #include "htslib/bgzf.h" #include "htslib/sam.h" @@ -460,7 +463,21 @@ static sam_hdr_t* external_reheader(samFile* in, const char* external) { fprintf(samtools_stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); return NULL; } - char tmp_fn[] = "reheaderXXXXXX"; + char tmp_fn[1024+15]; + char *env = getenv("TMPDIR"); +#ifdef _WIN32 + char tmp_path[MAX_PATH]; + if (!env) { + int ret = GetTempPath(MAX_PATH, tmp_path); + if (!ret || ret > MAX_PATH) + strcpy(tmp_path, "./"); + env = tmp_path; + } +#else + if (!env) + env = "/tmp"; +#endif + snprintf(tmp_fn, sizeof(tmp_fn), "%s/reheaderXXXXXX", env); int tmp_fd = mkstemp(tmp_fn); if (tmp_fd < 0) { print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 2ff6f7e9..28f7c2b0 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2024 Genome Research Ltd. + Copyright (C) 2008-2025 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -470,7 +470,14 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, // Fill in the tid part of the translation table, adding new targets // to the merged header as we go. - for (i = 0; i < sam_hdr_nref(translate); ++i) { + tbl->n_targets = sam_hdr_nref(translate); + tbl->tid_trans = calloc(tbl->n_targets ? 
tbl->n_targets : 1, sizeof(int)); + if (tbl->tid_trans == NULL) { + print_error_errno("merge", "failed to allocate @SQ translation table"); + return -1; + } + + for (i = 0; i < tbl->n_targets; ++i) { int trans_tid; sq_sn.l = 0; res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); @@ -797,11 +804,9 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, klist_t(hdrln) *rg_list = NULL; klist_t(hdrln) *pg_list = NULL; - tbl->n_targets = sam_hdr_nref(translate); + tbl->n_targets = 0; + tbl->tid_trans = NULL; tbl->rg_trans = tbl->pg_trans = NULL; - tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, - sizeof(int)); - if (tbl->tid_trans == NULL) goto memfail; tbl->rg_trans = kh_init(c2c); if (tbl->rg_trans == NULL) goto memfail; tbl->pg_trans = kh_init(c2c); @@ -867,7 +872,20 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, return -1; } -static int finish_merged_header(merged_header_t *merged_hdr) { +static int finish_merged_header(merged_header_t *merged_hdr, int resetorder) { + if (resetorder && merged_hdr->have_hd) { + //reset sort order to unsorted and remove GO, SS + if (sam_hdr_remove_tag_id(merged_hdr->hdr, "HD", NULL, NULL, "SS") < 0) { + return -1; + } + if (sam_hdr_remove_tag_id(merged_hdr->hdr, "HD", NULL, NULL, "GO") < 0) { + return -1; + } + if (sam_hdr_update_hd(merged_hdr->hdr, "SO", "unsorted") < 0) { + return -1; + } + } + if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), ks_len(&merged_hdr->out_rg)) < 0) return -1; @@ -1109,15 +1127,16 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c hts_itr_t **iter = NULL; sam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; - int *rtrans = NULL; + int *rtrans = NULL, resetorder = 0; char *out_idx_fn = NULL; void *hreg = NULL; hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; - refs_t *refs = NULL; + refs_t *refs = NULL, 
*refs_out = NULL; template_coordinate_keys_t *keys = NULL; khash_t(const_c2c) *lib_lookup = NULL; + int refs_out_shared = 1; // Is there a specified pre-prepared header to use for output? if (headers) { @@ -1207,16 +1226,38 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c int order_ok = 1; if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); - order_ok = 0; + refs_out_shared = order_ok = 0; + resetorder = 1; + } + + // Check our translated TIDs for fp[i] and fp[0] match. + if (i > 0) { + if (translation_tbl[i].n_targets != translation_tbl[0].n_targets + || memcmp(translation_tbl[0].tid_trans, + translation_tbl[i].tid_trans, + translation_tbl[0].n_targets * + sizeof(*translation_tbl[0].tid_trans)) != 0) + refs_out_shared = order_ok = 0; } - if (!refs) - refs = cram_get_refs(fp[i]); + if (order_ok) { + if (!refs) + refs = cram_get_refs(fp[i]); + if (!refs_out) + refs_out = refs; - if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) - goto fail; + if (refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) + goto fail; + } else { + refs = NULL; + } } + // We can share refs between compatible input files, but if any input is + // incompatible then so will sharing a ref with the output. + if (!refs_out_shared) + refs_out = NULL; + // Did we get an @HD line? 
if (!merged_hdr->have_hd) { fprintf(stderr, "[W::%s] No @HD tag found.\n", __func__); @@ -1229,7 +1270,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c } // Transform the header into standard form - if (finish_merged_header(merged_hdr) < 0) + if (finish_merged_header(merged_hdr, resetorder) < 0) goto fail; hout = merged_hdr->hdr; @@ -1410,7 +1451,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c } if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); - if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs)) + if (refs_out && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs_out)) goto fail; // Begin the actual merge @@ -3191,6 +3232,104 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) return NULL; } +// Updates header fields, adding the header if absent. +// Done as a macro instead of a function as we don't have va_list versions of +// these functions. +#define sam_hdr_update_sort(h, ...) ( \ + (-1 == sam_hdr_update_line((h), "HD", NULL, NULL, __VA_ARGS__, NULL) && \ + -1 == sam_hdr_add_line((h), "HD", "VN", SAM_FORMAT_VERSION, \ + __VA_ARGS__, NULL)) \ + ? -1 : 0) + +/* + * Sets the header sort order, group order and sub sort fields. + * Returns 0 on success + * -1 on failure + */ +static int set_sort_order(sam_hdr_t *h, int mapped) { + const char *new_so = NULL; + const char *new_go = NULL; + const char *new_ss = NULL; + + switch (g_sam_order) { + case Coordinate: + new_so = "coordinate"; + break; + case QueryName: + new_so = "queryname"; + new_ss = natural_sort + ? "queryname:natural" + : "queryname:lexicographical"; + break; + case MinHash: + new_so = mapped + ? "coordinate" + : "unsorted"; + new_ss = mapped + ? 
"coordinate:minhash" + : "unsorted:minhash"; + break; + case TagQueryName: + case TagCoordinate: + new_so = "unknown"; + break; + case TemplateCoordinate: + new_so = "unsorted"; + new_go = "query"; + new_ss = "unsorted:template-coordinate"; + break; + default: + new_so = "unknown"; + break; + } + + // Add or update HD + if (!new_ss && !new_go) { + // SO only + if (sam_hdr_update_sort(h, "SO", new_so) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s'\n", new_so); + return -1; + } + } else if (new_ss && !new_go) { + // SO and SS + if (sam_hdr_update_sort(h, "SO", new_so, "SS", new_ss) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s SS:%s'\n", new_so, new_ss); + return -1; + } + } else if (!new_ss && new_go) { + // SO and GO + if (sam_hdr_update_sort(h, "SO", new_so, "GO", new_go) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s GO:%s'\n", new_so, new_go); + return -1; + } + } else { + // SO, GO and SS + if (sam_hdr_update_sort(h, "SO", new_so, "GO", new_go, + "SS", new_ss) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s GO:%s SS:%s'\n", new_so, new_go, new_ss); + return -1; + } + } + + // Remove old HD entries + if (!new_go && sam_hdr_remove_tag_hd(h, "GO") == -1) { + print_error("sort", "failed to delete group order in header\n"); + return -1; + } + + if (!new_ss && sam_hdr_remove_tag_hd(h, "SS") == -1) { + print_error("sort", "failed to delete sub sort in header\n"); + return -1; + } + + return 0; +} + + /*! 
@abstract Sort an unsorted BAM file based on the provided sort order @@ -3231,9 +3370,6 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, uint8_t *bam_mem = NULL; char **fns = NULL; size_t fns_size = 0; - const char *new_so = NULL; - const char *new_go = NULL; - const char *new_ss = NULL; buf_region *in_mem = NULL; khash_t(const_c2c) *lib_lookup = NULL; htsThreadPool htspool = { NULL, 0 }; @@ -3305,83 +3441,6 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; } - switch (g_sam_order) { - case Coordinate: - new_so = "coordinate"; - break; - case QueryName: - new_so = "queryname"; - new_ss = natural_sort - ? "queryname:natural" - : "queryname:lexicographical"; - break; - case MinHash: - new_so = "coordinate"; - new_ss = "coordinate:minhash"; - break; - case TagQueryName: - case TagCoordinate: - new_so = "unknown"; - break; - case TemplateCoordinate: - new_so = "unsorted"; - new_go = "query"; - new_ss = "unsorted:template-coordinate"; - break; - default: - new_so = "unknown"; - break; - } - - if (new_ss == NULL && new_go == NULL) { // just SO - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); - goto err; - } - } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "SS", new_ss, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n", - new_so, new_ss); - goto err; - } - } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "GO", new_go, 
NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n", - new_so, new_go); - goto err; - } - } else { // update SO, GO, and SS - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "GO", new_go, "SS", new_ss, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n", - new_so, new_go, new_ss); - goto err; - } - } - - if (new_go == NULL) { - if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { - print_error("sort", "failed to delete group order in header\n"); - goto err; - } - } - if (new_ss == NULL) { - if (-1 == sam_hdr_remove_tag_hd(header, "SS")) { - print_error("sort", "failed to delete sub sort in header\n"); - goto err; - } - } - if (n_threads > 1) { htspool.pool = hts_tpool_init(n_threads); if (!htspool.pool) { @@ -3402,9 +3461,12 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // write sub files k = max_k = bam_mem_offset = 0; size_t name_len = strlen(prefix) + 30; + int placed = 0; while ((res = sam_read1(fp, header, b)) >= 0) { int mem_full = 0; + placed |= b->core.tid >= 0; + if (k == max_k) { bam1_tag *new_buf; max_k = max_k? max_k<<1 : 0x10000; @@ -3540,6 +3602,10 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, num_in_mem = 0; } + // Set the order here as we need to know if entirely unmapped. 
+ if (set_sort_order(header, placed) < 0) + goto err; + // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, @@ -3548,9 +3614,12 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; } } else { // then merge - fprintf(stderr, + if (hts_get_log_level() >= 2) { // 2 is between the WARNING (default) and ERROR levels + fprintf(stderr, "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", n_files, num_in_mem); + } + // Paranoia check - all temporary files should have a name for (i = 0; i < n_files; ++i) { if (!fns[i]) { @@ -3761,7 +3830,7 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } - if (ga.write_index && sam_order != Coordinate) { + if (ga.write_index && sam_order != Coordinate && sam_order != MinHash) { fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); ga.write_index = 0; } diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 3aa3a49a..46687648 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -2,7 +2,7 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2024 Genome Research Ltd. + Copyright (C) 2008-2025 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -472,7 +472,14 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, // Fill in the tid part of the translation table, adding new targets // to the merged header as we go. - for (i = 0; i < sam_hdr_nref(translate); ++i) { + tbl->n_targets = sam_hdr_nref(translate); + tbl->tid_trans = calloc(tbl->n_targets ? 
tbl->n_targets : 1, sizeof(int)); + if (tbl->tid_trans == NULL) { + print_error_errno("merge", "failed to allocate @SQ translation table"); + return -1; + } + + for (i = 0; i < tbl->n_targets; ++i) { int trans_tid; sq_sn.l = 0; res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); @@ -799,11 +806,9 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, klist_t(hdrln) *rg_list = NULL; klist_t(hdrln) *pg_list = NULL; - tbl->n_targets = sam_hdr_nref(translate); + tbl->n_targets = 0; + tbl->tid_trans = NULL; tbl->rg_trans = tbl->pg_trans = NULL; - tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, - sizeof(int)); - if (tbl->tid_trans == NULL) goto memfail; tbl->rg_trans = kh_init(c2c); if (tbl->rg_trans == NULL) goto memfail; tbl->pg_trans = kh_init(c2c); @@ -869,7 +874,20 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, return -1; } -static int finish_merged_header(merged_header_t *merged_hdr) { +static int finish_merged_header(merged_header_t *merged_hdr, int resetorder) { + if (resetorder && merged_hdr->have_hd) { + //reset sort order to unsorted and remove GO, SS + if (sam_hdr_remove_tag_id(merged_hdr->hdr, "HD", NULL, NULL, "SS") < 0) { + return -1; + } + if (sam_hdr_remove_tag_id(merged_hdr->hdr, "HD", NULL, NULL, "GO") < 0) { + return -1; + } + if (sam_hdr_update_hd(merged_hdr->hdr, "SO", "unsorted") < 0) { + return -1; + } + } + if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), ks_len(&merged_hdr->out_rg)) < 0) return -1; @@ -1111,15 +1129,16 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c hts_itr_t **iter = NULL; sam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; - int *rtrans = NULL; + int *rtrans = NULL, resetorder = 0; char *out_idx_fn = NULL; void *hreg = NULL; hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; - refs_t *refs = NULL; + refs_t *refs = NULL, 
*refs_out = NULL; template_coordinate_keys_t *keys = NULL; khash_t(const_c2c) *lib_lookup = NULL; + int refs_out_shared = 1; // Is there a specified pre-prepared header to use for output? if (headers) { @@ -1209,16 +1228,38 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c int order_ok = 1; if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) { fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); - order_ok = 0; + refs_out_shared = order_ok = 0; + resetorder = 1; + } + + // Check our translated TIDs for fp[i] and fp[0] match. + if (i > 0) { + if (translation_tbl[i].n_targets != translation_tbl[0].n_targets + || memcmp(translation_tbl[0].tid_trans, + translation_tbl[i].tid_trans, + translation_tbl[0].n_targets * + sizeof(*translation_tbl[0].tid_trans)) != 0) + refs_out_shared = order_ok = 0; } - if (!refs) - refs = cram_get_refs(fp[i]); + if (order_ok) { + if (!refs) + refs = cram_get_refs(fp[i]); + if (!refs_out) + refs_out = refs; - if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) - goto fail; + if (refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) + goto fail; + } else { + refs = NULL; + } } + // We can share refs between compatible input files, but if any input is + // incompatible then so will sharing a ref with the output. + if (!refs_out_shared) + refs_out = NULL; + // Did we get an @HD line? 
if (!merged_hdr->have_hd) { fprintf(samtools_stderr, "[W::%s] No @HD tag found.\n", __func__); @@ -1231,7 +1272,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c } // Transform the header into standard form - if (finish_merged_header(merged_hdr) < 0) + if (finish_merged_header(merged_hdr, resetorder) < 0) goto fail; hout = merged_hdr->hdr; @@ -1412,7 +1453,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c } if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); - if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs)) + if (refs_out && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs_out)) goto fail; // Begin the actual merge @@ -3193,6 +3234,104 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) return NULL; } +// Updates header fields, adding the header if absent. +// Done as a macro instead of a function as we don't have va_list versions of +// these functions. +#define sam_hdr_update_sort(h, ...) ( \ + (-1 == sam_hdr_update_line((h), "HD", NULL, NULL, __VA_ARGS__, NULL) && \ + -1 == sam_hdr_add_line((h), "HD", "VN", SAM_FORMAT_VERSION, \ + __VA_ARGS__, NULL)) \ + ? -1 : 0) + +/* + * Sets the header sort order, group order and sub sort fields. + * Returns 0 on success + * -1 on failure + */ +static int set_sort_order(sam_hdr_t *h, int mapped) { + const char *new_so = NULL; + const char *new_go = NULL; + const char *new_ss = NULL; + + switch (g_sam_order) { + case Coordinate: + new_so = "coordinate"; + break; + case QueryName: + new_so = "queryname"; + new_ss = natural_sort + ? "queryname:natural" + : "queryname:lexicographical"; + break; + case MinHash: + new_so = mapped + ? "coordinate" + : "unsorted"; + new_ss = mapped + ? 
"coordinate:minhash" + : "unsorted:minhash"; + break; + case TagQueryName: + case TagCoordinate: + new_so = "unknown"; + break; + case TemplateCoordinate: + new_so = "unsorted"; + new_go = "query"; + new_ss = "unsorted:template-coordinate"; + break; + default: + new_so = "unknown"; + break; + } + + // Add or update HD + if (!new_ss && !new_go) { + // SO only + if (sam_hdr_update_sort(h, "SO", new_so) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s'\n", new_so); + return -1; + } + } else if (new_ss && !new_go) { + // SO and SS + if (sam_hdr_update_sort(h, "SO", new_so, "SS", new_ss) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s SS:%s'\n", new_so, new_ss); + return -1; + } + } else if (!new_ss && new_go) { + // SO and GO + if (sam_hdr_update_sort(h, "SO", new_so, "GO", new_go) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s GO:%s'\n", new_so, new_go); + return -1; + } + } else { + // SO, GO and SS + if (sam_hdr_update_sort(h, "SO", new_so, "GO", new_go, + "SS", new_ss) == -1) { + print_error("sort", "failed to change sort order header to " + "'SO:%s GO:%s SS:%s'\n", new_so, new_go, new_ss); + return -1; + } + } + + // Remove old HD entries + if (!new_go && sam_hdr_remove_tag_hd(h, "GO") == -1) { + print_error("sort", "failed to delete group order in header\n"); + return -1; + } + + if (!new_ss && sam_hdr_remove_tag_hd(h, "SS") == -1) { + print_error("sort", "failed to delete sub sort in header\n"); + return -1; + } + + return 0; +} + + /*! 
@abstract Sort an unsorted BAM file based on the provided sort order @@ -3233,9 +3372,6 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, uint8_t *bam_mem = NULL; char **fns = NULL; size_t fns_size = 0; - const char *new_so = NULL; - const char *new_go = NULL; - const char *new_ss = NULL; buf_region *in_mem = NULL; khash_t(const_c2c) *lib_lookup = NULL; htsThreadPool htspool = { NULL, 0 }; @@ -3307,83 +3443,6 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; } - switch (g_sam_order) { - case Coordinate: - new_so = "coordinate"; - break; - case QueryName: - new_so = "queryname"; - new_ss = natural_sort - ? "queryname:natural" - : "queryname:lexicographical"; - break; - case MinHash: - new_so = "coordinate"; - new_ss = "coordinate:minhash"; - break; - case TagQueryName: - case TagCoordinate: - new_so = "unknown"; - break; - case TemplateCoordinate: - new_so = "unsorted"; - new_go = "query"; - new_ss = "unsorted:template-coordinate"; - break; - default: - new_so = "unknown"; - break; - } - - if (new_ss == NULL && new_go == NULL) { // just SO - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); - goto err; - } - } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "SS", new_ss, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n", - new_so, new_ss); - goto err; - } - } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "GO", new_go, 
NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n", - new_so, new_go); - goto err; - } - } else { // update SO, GO, and SS - if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, - "SO", new_so, "GO", new_go, "SS", new_ss, NULL)) - ) { - print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n", - new_so, new_go, new_ss); - goto err; - } - } - - if (new_go == NULL) { - if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { - print_error("sort", "failed to delete group order in header\n"); - goto err; - } - } - if (new_ss == NULL) { - if (-1 == sam_hdr_remove_tag_hd(header, "SS")) { - print_error("sort", "failed to delete sub sort in header\n"); - goto err; - } - } - if (n_threads > 1) { htspool.pool = hts_tpool_init(n_threads); if (!htspool.pool) { @@ -3404,9 +3463,12 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // write sub files k = max_k = bam_mem_offset = 0; size_t name_len = strlen(prefix) + 30; + int placed = 0; while ((res = sam_read1(fp, header, b)) >= 0) { int mem_full = 0; + placed |= b->core.tid >= 0; + if (k == max_k) { bam1_tag *new_buf; max_k = max_k? max_k<<1 : 0x10000; @@ -3542,6 +3604,10 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, num_in_mem = 0; } + // Set the order here as we need to know if entirely unmapped. 
+ if (set_sort_order(header, placed) < 0) + goto err; + // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, @@ -3550,9 +3616,12 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; } } else { // then merge - fprintf(samtools_stderr, + if (hts_get_log_level() >= 2) { // 2 is between the WARNING (default) and ERROR levels + fprintf(samtools_stderr, "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", n_files, num_in_mem); + } + // Paranoia check - all temporary files should have a name for (i = 0; i < n_files; ++i) { if (!fns[i]) { @@ -3763,7 +3832,7 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } - if (ga.write_index && sam_order != Coordinate) { + if (ga.write_index && sam_order != Coordinate && sam_order != MinHash) { fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); ga.write_index = 0; } diff --git a/samtools/bamtk.c b/samtools/bamtk.c index 8c330bc2..9f54c887 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -74,6 +74,7 @@ int main_consensus(int argc, char *argv[]); int main_reference(int argc, char *argv[]); int main_reset(int argc, char *argv[]); int main_cram_size(int argc, char *argv[]); +int main_checksum(int argc, char *argv[]); const char *samtools_version(void) { @@ -103,7 +104,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { printf("samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2024 Genome Research Ltd.\n", + "Copyright (C) 2025 Genome Research Ltd.\n", samtools_version(), hts_version()); printf("\nSamtools compilation details:\n"); @@ -195,6 +196,7 @@ static void usage(FILE *fp) " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" " ampliconstats generate amplicon specific stats\n" +" checksum produce order-agnostic checksums of sequence content\n" "\n" " -- 
Viewing\n" " flags explain BAM flags\n" @@ -292,6 +294,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1); + else if (strcmp(argv[1], "checksum") == 0) ret = main_checksum(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index a76729ce..340a0cdc 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -77,6 +77,7 @@ int main_consensus(int argc, char *argv[]); int main_reference(int argc, char *argv[]); int main_reset(int argc, char *argv[]); int main_cram_size(int argc, char *argv[]); +int main_checksum(int argc, char *argv[]); const char *samtools_version(void) { @@ -106,7 +107,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { fprintf(samtools_stdout, "samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2024 Genome Research Ltd.\n", + "Copyright (C) 2025 Genome Research Ltd.\n", samtools_version(), hts_version()); fprintf(samtools_stdout, "\nSamtools compilation details:\n"); @@ -198,6 +199,7 @@ static void usage(FILE *fp) " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" " ampliconstats generate amplicon specific stats\n" +" checksum produce order-agnostic checksums of sequence content\n" "\n" " -- Viewing\n" " flags explain BAM flags\n" @@ -295,6 +297,7 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1); + else if (strcmp(argv[1], "checksum") == 0) ret = main_checksum(argc-1, argv+1); else if 
(strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); diff --git a/samtools/bedcov.c b/samtools/bedcov.c index 10eeface..f314c3b0 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -51,13 +51,16 @@ typedef struct { int64_t rcnt; } aux_t; +#define HDR_CHROM "#chrom\t" + static int read_bam(void *data, bam1_t *b) { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret; while (1) { - ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b); + ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : + sam_read1(aux->fp, aux->header, b); if ( ret<0 ) break; if ( b->core.flag & aux->flags ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; @@ -72,6 +75,50 @@ static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) { return 0; } +/// output_header - dump the header in output +/** @param fp - pointer to output file +* @param hdr - header from bed file, when it has one +* @param fields - field count to fill with \t when it doesn't have a header +* @param filecount - no. of input files +* @param argv - input files, for header naming +* @param depth - depth threshold configuration +* @param rcount - show read count configuration +* returns nothing +*/ +static void output_header(FILE *fp, char *hdr, int fields, int filecount, + char *argv[], int depth, int rcount) +{ + int i = 0; + if (hdr) { //header available from bed file + fprintf(fp, "%s", hdr); + } else { + /* no header in bed, add one. add headers as defined in format. + use empty header with tab separation for fields above those defined in + format */ + char *bedcols[] = { "chrom", "chromStart", "chromEnd", "name", "score", + "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", + "blockSizes", "blockStarts"}; + for (i = 0; i < fields; ++i) { + fprintf(fp, "%s%s", (i ? "\t" : "#"), + (i < sizeof(bedcols)/sizeof(bedcols[i]) ? 
bedcols[i] : ".")); + } + } + for (i = 0; i < filecount; ++i) { //coverage header + fprintf(fp, "\t%s_cov", argv[i + optind + 1]); + } + if (depth >= 0) { //depth header + for (i = 0; i < filecount; ++i) { + fprintf(fp, "\t%s_depth", argv[i + optind + 1]); + } + } + if (rcount) { //read count header + for (i = 0; i < filecount; ++i) { + fprintf(fp, "\t%s_count", argv[i + optind + 1]); + } + } + fprintf(fp, "\n"); +} + int main_bedcov(int argc, char *argv[]) { gzFile fp; @@ -85,7 +132,8 @@ int main_bedcov(int argc, char *argv[]) const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0; + int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0, + hdr = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -154,6 +202,9 @@ int main_bedcov(int argc, char *argv[]) n = argc - optind - 1; } + if (!print_header) { // no header output needed, avoid check for header line + hdr = 1; + } memset(&str, 0, sizeof(kstring_t)); aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); @@ -196,22 +247,6 @@ int main_bedcov(int argc, char *argv[]) return 2; } - if (print_header) { - printf("#chrom\tstart\tend"); - for (i = 0; i < n; ++i) { - printf("\t%s_cov", argv[i+optind+1]); - } - if (min_depth >= 0) { - for (i = 0; i < n; ++i) - printf("\t%s_depth", argv[i+optind+1]); - } - if (do_rcount) { - for (i = 0; i < n; ++i) - printf("\t%s_count", argv[i+optind+1]); - } - putchar('\n'); - } - ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); @@ -221,12 +256,36 @@ int main_bedcov(int argc, char *argv[]) int64_t beg = 0, end = 0; bam_mplp_t mplp; - if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ + if (str.l == 0) { + continue; /* empty */ + } + if (*str.s == '#') { // header or comment + if (!hdr && 
!strncmp(str.s, HDR_CHROM, sizeof(HDR_CHROM) - 1)) { + //header line and header output set + output_header(stdout, str.s, -1, n, argv, min_depth, do_rcount); + hdr = 1; + } + continue; // comment line or header + } /* Track and browser lines. Also look for a trailing *space* in case someone has badly-chosen a chromosome name (it would be followed by a tab in that case). */ if (strncmp(str.s, "track ", 6) == 0) continue; if (strncmp(str.s, "browser ", 8) == 0) continue; + if (!hdr) { + //no header line, header output set, find no of fields from bed line + //no header line yet and need header, find no of fields in bed line + int fields = 0; + char *tmp = str.s; + while (*tmp) { + if (*tmp++ == '\t') { + fields++; + } + } + output_header(stdout, NULL, fields + 1, n, argv, min_depth, do_rcount); + hdr = 1; + //continue the processing of bed data line + } for (p = q = str.s; *p && !isspace(*p); ++p); if (*p == 0) goto bed_error; char c = *p; diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index c6f0cccd..be66fbf6 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -53,13 +53,16 @@ typedef struct { int64_t rcnt; } aux_t; +#define HDR_CHROM "#chrom\t" + static int read_bam(void *data, bam1_t *b) { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret; while (1) { - ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b); + ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : + sam_read1(aux->fp, aux->header, b); if ( ret<0 ) break; if ( b->core.flag & aux->flags ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; @@ -74,6 +77,50 @@ static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) { return 0; } +/// output_header - dump the header in output +/** @param fp - pointer to output file +* @param hdr - header from bed file, when it has one +* @param fields - field count to fill with \t when it doesn't have a header +* @param filecount - no. of input files +* @param argv - input files, for header naming +* @param depth - depth threshold configuration +* @param rcount - show read count configuration +* returns nothing +*/ +static void output_header(FILE *fp, char *hdr, int fields, int filecount, + char *argv[], int depth, int rcount) +{ + int i = 0; + if (hdr) { //header available from bed file + fprintf(fp, "%s", hdr); + } else { + /* no header in bed, add one. add headers as defined in format. + use empty header with tab separation for fields above those defined in + format */ + char *bedcols[] = { "chrom", "chromStart", "chromEnd", "name", "score", + "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", + "blockSizes", "blockStarts"}; + for (i = 0; i < fields; ++i) { + fprintf(fp, "%s%s", (i ? "\t" : "#"), + (i < sizeof(bedcols)/sizeof(bedcols[i]) ? 
bedcols[i] : ".")); + } + } + for (i = 0; i < filecount; ++i) { //coverage header + fprintf(fp, "\t%s_cov", argv[i + optind + 1]); + } + if (depth >= 0) { //depth header + for (i = 0; i < filecount; ++i) { + fprintf(fp, "\t%s_depth", argv[i + optind + 1]); + } + } + if (rcount) { //read count header + for (i = 0; i < filecount; ++i) { + fprintf(fp, "\t%s_count", argv[i + optind + 1]); + } + } + fprintf(fp, "\n"); +} + int main_bedcov(int argc, char *argv[]) { gzFile fp; @@ -87,7 +134,8 @@ int main_bedcov(int argc, char *argv[]) const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0; + int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0, + hdr = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -156,6 +204,9 @@ int main_bedcov(int argc, char *argv[]) n = argc - optind - 1; } + if (!print_header) { // no header output needed, avoid check for header line + hdr = 1; + } memset(&str, 0, sizeof(kstring_t)); aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); @@ -198,22 +249,6 @@ int main_bedcov(int argc, char *argv[]) return 2; } - if (print_header) { - fprintf(samtools_stdout, "#chrom\tstart\tend"); - for (i = 0; i < n; ++i) { - fprintf(samtools_stdout, "\t%s_cov", argv[i+optind+1]); - } - if (min_depth >= 0) { - for (i = 0; i < n; ++i) - fprintf(samtools_stdout, "\t%s_depth", argv[i+optind+1]); - } - if (do_rcount) { - for (i = 0; i < n; ++i) - fprintf(samtools_stdout, "\t%s_count", argv[i+optind+1]); - } - fputc('\n', samtools_stdout); - } - ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); @@ -223,12 +258,36 @@ int main_bedcov(int argc, char *argv[]) int64_t beg = 0, end = 0; bam_mplp_t mplp; - if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ + if (str.l == 0) { + continue; 
/* empty */ + } + if (*str.s == '#') { // header or comment + if (!hdr && !strncmp(str.s, HDR_CHROM, sizeof(HDR_CHROM) - 1)) { + //header line and header output set + output_header(samtools_stdout, str.s, -1, n, argv, min_depth, do_rcount); + hdr = 1; + } + continue; // comment line or header + } /* Track and browser lines. Also look for a trailing *space* in case someone has badly-chosen a chromosome name (it would be followed by a tab in that case). */ if (strncmp(str.s, "track ", 6) == 0) continue; if (strncmp(str.s, "browser ", 8) == 0) continue; + if (!hdr) { + //no header line, header output set, find no of fields from bed line + //no header line yet and need header, find no of fields in bed line + int fields = 0; + char *tmp = str.s; + while (*tmp) { + if (*tmp++ == '\t') { + fields++; + } + } + output_header(samtools_stdout, NULL, fields + 1, n, argv, min_depth, do_rcount); + hdr = 1; + //continue the processing of bed data line + } for (p = q = str.s; *p && !isspace(*p); ++p); if (*p == 0) goto bed_error; char c = *p; diff --git a/samtools/consensus_pileup.c b/samtools/consensus_pileup.c index c9667b3c..1a1b4d0c 100644 --- a/samtools/consensus_pileup.c +++ b/samtools/consensus_pileup.c @@ -342,8 +342,9 @@ int pileup_loop(samFile *fp, hts_pos_t pos; r = seq_fetch(client_data, fp, h, &pnew->b); + //fprintf(stderr, "Fetch at %ld\n", pnew->b.core.pos); if (r < -1) { - fprintf(stderr, "bam_next_seq() failure.\n"); + fprintf(stderr, "pileup_loop() seq_fetch failure.\n"); goto error; } @@ -548,8 +549,11 @@ int pileup_loop(samFile *fp, if (seq_init) { int v; v = seq_init(client_data, fp, h, p); - if (v == -1) + if (v == -1) { + p->next = pfree; + pfree = p; goto error; + } if (v == 1) { /* Keep this seq */ @@ -602,3 +606,4 @@ int pileup_loop(samFile *fp, return ret; } + diff --git a/samtools/consensus_pileup.c.pysam.c b/samtools/consensus_pileup.c.pysam.c index adb68699..ee497d86 100644 --- a/samtools/consensus_pileup.c.pysam.c +++ 
b/samtools/consensus_pileup.c.pysam.c @@ -344,8 +344,9 @@ int pileup_loop(samFile *fp, hts_pos_t pos; r = seq_fetch(client_data, fp, h, &pnew->b); + //fprintf(samtools_stderr, "Fetch at %ld\n", pnew->b.core.pos); if (r < -1) { - fprintf(samtools_stderr, "bam_next_seq() failure.\n"); + fprintf(samtools_stderr, "pileup_loop() seq_fetch failure.\n"); goto error; } @@ -550,8 +551,11 @@ int pileup_loop(samFile *fp, if (seq_init) { int v; v = seq_init(client_data, fp, h, p); - if (v == -1) + if (v == -1) { + p->next = pfree; + pfree = p; goto error; + } if (v == 1) { /* Keep this seq */ @@ -604,3 +608,4 @@ int pileup_loop(samFile *fp, return ret; } + diff --git a/samtools/consensus_pileup.h b/samtools/consensus_pileup.h index cc400aa2..f1ed5b61 100644 --- a/samtools/consensus_pileup.h +++ b/samtools/consensus_pileup.h @@ -1,6 +1,6 @@ /* consensus_pileup.h -- Pileup orientated data per consensus column - Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd. + Copyright (C) 2013-2016, 2020-2022, 2024 Genome Research Ltd. Author: James Bonfied @@ -100,3 +100,59 @@ int pileup_loop(samFile *fp, sam_hdr_t *h, pileup_t *p), void *client_data); + + +// A class-like set of data and callback functions +typedef struct { + // Caller private data, passed back to the callbacks + void *client_data; + + // A new client context + void *client_context; + + // Creates a new client context. + // Called once per block of threaded data. + void (*context_new)(void *client_data, + samFile *fp, + sam_hdr_t *h); + + // Called when a client context goes out of scope. + // This is only ever called in chr/pos order. + int (*context_free)(void *client_data, + void *client_context); + + // Read a new sequence record + int (*seq_fetch)(void *client_data, + void *client_context, + samFile *fp, + sam_hdr_t *h, + bam1_t *b); + + // Called once per sequence, the first time it is used. 
+ int (*seq_init)(void *client_data, + void *client_context, + samFile *fp, + sam_hdr_t *h, + pileup_t *p); + + // Called once per consensus pileup + int (*seq_column)(void *client_data, + void *client_context, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert); + + // Called once per sequence, once it's no longer needed. + void (*seq_free)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p); +} pileup_context; + +//int pileup_loop_parallel(samFile *fp, +// sam_hdr_t *h, +// pileup_context *ctx); diff --git a/samtools/faidx.c b/samtools/faidx.c index 616d8252..5abbb4ad 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -56,12 +56,29 @@ DEALINGS IN THE SOFTWARE. # define ABS(x) ((x)>=0?(x):-(x)) #endif +/// holds the indexing info for each read name and offsets +typedef struct idx_entry { + char *name; //name + uint64_t seq_offset; //offset to sequence for given read/reference + uint64_t seq_length; //length of sequence + uint64_t qual_offset; //offset to qualify val for given read + uint64_t line_length; //line length with output is made +} idx_entry; + +/// index information about output +typedef struct idx { + size_t n, m; //no of used and max items in index + enum fai_format_options format; //fasta or fastq + idx_entry *indx; //array of index info per sequence + uint64_t offset; //accumulated offset +} idx; + //new params required for output creation typedef struct output { int isbgzip; //is bgzip or uncompressed file - FILE *fp; //uncompressed file pointer BGZF *bgzf_fp; //bgzf file pointer sam_global_args *gopt; //options + idx *idxdata; //index information kstring_t buffer; } output; @@ -110,20 +127,99 @@ static void reverse(char *str, const hts_pos_t len) { } } -/// wrappedwrite - wraps the fwrite and bgzf_write -/** @param out - pointer to data required to write output -* @param buffer - data to write -* @param length - data length -* returns error or length written on success +/// 
allocidx - allocates required index data buffers +/** @param in - pointer to idx structure + returns NULL on failure + returns index data buffer on success */ -static inline size_t wrappedwrite(output *out, const char *buffer, size_t length) +static inline idx_entry* allocidx(idx* in) { - return out->isbgzip ? bgzf_write(out->bgzf_fp, buffer, length) : - fwrite(buffer, 1, length, out->fp); + if (in && in->n >= in->m) { + size_t newlen = in->m < 1 ? 16 : in->m << 1; //double on reallocation + idx_entry *tmp = realloc(in->indx, newlen * sizeof(*tmp)); + if (!tmp) { + return NULL; + } + size_t count = newlen - in->n; + memset(tmp + in->n, 0, count * sizeof(*tmp)); + in->indx = tmp; + in->m = newlen; + } + + return &in->indx[in->n++]; +} + +/// writeindex - writes index data +/** @param out - pointer to output structure + * @param output_file - pointer to output file name + returns non zero on failure + returns 0 on success + seq name and offsets are written on fai index, for both compressed and + uncompressed outputs. gzi index, dumped through bgzf api, gives the index + of plain offsets in compressed file +*/ +int writeindex(output *out, char *output_file) +{ + idx *idxdata = out->idxdata; + kstring_t fainame = KS_INITIALIZE, buffer = KS_INITIALIZE; + int ret = 0; + FILE *fp = NULL; + size_t i = 0; + + ksprintf(&fainame, "%s.fai", output_file); + + if (!(fp = fopen(fainame.s, "w"))) { + fprintf(stderr, "[faidx] Failed to create index file for output.\n"); + ret = 1; + goto end; + } + + // Write fai index data / index on plain - uncompressed data. + // Note on Windows htslib's hfile_oflags() and hopen_fd_stdinout() + // functions guarantee we'll set O_BINARY so the line length is always + // sequence length +1 regardless of the system native line ending. 
+ for (i = 0; i < idxdata->n; ++i) { + idx_entry *e = &idxdata->indx[i]; + ks_clear(&buffer); + if (idxdata->format == FAI_FASTA) { + //name, seq leng, seq offset, seq per line, char per line + ksprintf(&buffer, "%s\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%" + PRIu64"\n", + e->name, e->seq_length, e->seq_offset, e->line_length, + e->line_length + 1); + } else { //FAI_FASTQ + //name, seq leng, seq offset, seq/line, char/line, qual offset + ksprintf(&buffer, "%s\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%" + PRIu64"\t%"PRIu64"\n", + e->name, e->seq_length, e->seq_offset, e->line_length, + e->line_length + 1, e->qual_offset); + } + if (buffer.l != fwrite(buffer.s, 1, buffer.l, fp)) { + fprintf(stderr, "[faidx] Failed to create fai index file for " + "output.\n"); + ret = 1; + goto end; + } + } + //write gzi index data, index on compressed file + if (out->isbgzip && bgzf_index_dump(out->bgzf_fp, output_file, ".gzi")) { + fprintf(stderr, "[faidx] Failed to create index gzi file for " + "output.\n"); + ret = 1; + } +end: + if (fp) { + fclose(fp); + } + ks_free(&buffer); + ks_free(&fainame); + + return ret; } -static int write_line(faidx_t *faid, output *out, const char *line, const char *name, - const int ignore, const hts_pos_t length, const hts_pos_t seq_len) { +static int write_line(faidx_t *faid, output *out, const char *line, + const char *name, const int ignore, + const hts_pos_t length, const hts_pos_t seq_len) { int id; hts_pos_t beg, end; @@ -131,9 +227,9 @@ static int write_line(faidx_t *faid, output *out, const char *line, const char * fprintf(stderr, "[faidx] Failed to fetch sequence in %s\n", name); if (ignore && seq_len == -2) { - return EXIT_SUCCESS; + return 0; } else { - return EXIT_FAILURE; + return -1; } } else if (seq_len == 0) { fprintf(stderr, "[faidx] Zero length sequence: %s\n", name); @@ -147,24 +243,26 @@ static int write_line(faidx_t *faid, output *out, const char *line, const char * for (i = 0; i < seq_sz; i += length) { hts_pos_t len = i + length < seq_sz 
? length : seq_sz - i; - if (wrappedwrite(out, line + i, len) < len || - wrappedwrite(out, "\n", 1) < 1) { + if (bgzf_write(out->bgzf_fp, line + i, len) < len || + bgzf_write(out->bgzf_fp, "\n", 1) < 1) { print_error_errno("faidx", "failed to write output"); - return EXIT_FAILURE; + return -1; } } - return EXIT_SUCCESS; + return 0; } -static int write_output(faidx_t *faid, output *out, const char *name, const int ignore, - const hts_pos_t length, const int rev, - const char *pos_strand_name, const char *neg_strand_name, +static int write_output(faidx_t *faid, output *out, const char *name, + const int ignore, const hts_pos_t length, const int rev, + const char *pos_strand_name, + const char *neg_strand_name, enum fai_format_options format) { hts_pos_t seq_len, wrap_len = length, len = 0; char *seq = NULL, *qual = NULL; int ret = EXIT_FAILURE; + char *idx_name = NULL; if (wrap_len < 0) wrap_len = fai_line_length(faid, name); @@ -175,44 +273,74 @@ static int write_output(faidx_t *faid, output *out, const char *name, const int if (rev && seq_len > 0) { reverse_complement(seq, seq_len); } + //write the name - len = ksprintf(&out->buffer, "%c%s%s\n", format == FAI_FASTA ? '>' : '@', name, rev ? neg_strand_name : pos_strand_name); - if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) { - fprintf(stderr,"[faidx] Failed to write buffer\n"); - goto exit; - } ks_clear(&out->buffer); - //write bases - if ((ret = write_line(faid, out, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) { - goto exit; + len = ksprintf(&out->buffer, "%c%s%s\n", + format == FAI_FASTA ? '>' : '@', name, + rev ? 
neg_strand_name : pos_strand_name); + if (out->gopt->write_index) { + if (!(idx_name = strdup(out->buffer.s+1))) { + fprintf(stderr,"[faidx] Failed to allocate memory.\n"); + goto end; + } + idx_name[out->buffer.l-2] = 0; // remove \n + } + if (bgzf_write(out->bgzf_fp, out->buffer.s, out->buffer.l) < len) { + fprintf(stderr,"[faidx] Failed to write buffer.\n"); + goto end; } + //write bases + if (write_line(faid, out, seq, name, ignore, wrap_len, seq_len) < 0) + goto end; + + uint64_t seq_sz; + seq_sz = seq_len + seq_len / wrap_len + ((seq_len % wrap_len) ? 1 : 0); + if (format == FAI_FASTQ) { //write quality qual = fai_fetchqual64(faid, name, &seq_len); - if (rev && seq_len > 0) { + if (rev && seq_len > 0) reverse(qual, seq_len); - } - len = ksprintf(&out->buffer, "+\n"); - if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) { + if (bgzf_write(out->bgzf_fp, "+\n", 2) != 2) { fprintf(stderr,"[faidx] Failed to write buffer\n"); - goto exit; + goto end; } - ks_clear(&out->buffer); - if ((ret = write_line(faid, out, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) { - goto exit; + + if (write_line(faid, out, qual, name, ignore, wrap_len, seq_len) < 0) + goto end; + } + + if (out->gopt->write_index) { + // On-the-fly index construction + idx_entry *e = NULL; + if (out->gopt->write_index && !(e = allocidx(out->idxdata))) { + fprintf(stderr, "[faidx] Failed to allocate memory.\n"); + goto end; + } + + e->name = idx_name; + e->seq_offset = out->idxdata->offset + len; + e->seq_length = seq_len; + e->line_length = seq_len < wrap_len ? 
seq_len : wrap_len; + idx_name = NULL; + if (out->idxdata->format == FAI_FASTA) { + out->idxdata->offset = e->seq_offset + seq_sz; + } else { // FASTQ + e->qual_offset = e->seq_offset + seq_sz + 2; // "+\n" + out->idxdata->offset = e->qual_offset + seq_sz; } } + ret = EXIT_SUCCESS; -exit: - if (seq) { - free(seq); - } - if (qual) { - free(qual); - } +end: + free(seq); + free(qual); + free(idx_name); + return ret; } @@ -272,7 +400,7 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) } fprintf(fp, " -h, --help This message.\n"); - sam_global_opt_help(fp, "---.-@--"); + sam_global_opt_help(fp, "---.-@.-"); return exit_status; } @@ -289,8 +417,9 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) char *fai_name = NULL; // specified index name char *gzi_name = NULL; // specified compressed index name sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - int exit_status = EXIT_FAILURE, flushed = 0; - struct output out = { 0, stdout, NULL, &ga, KS_INITIALIZE}; //data required for output writing + int exit_status = EXIT_FAILURE; + idx idxdata = { 0, 0, FAI_FASTA, NULL}; + struct output out = { 0, NULL, &ga, &idxdata, KS_INITIALIZE}; //data required for output writing faidx_t *fai = NULL; hts_tpool *pool = NULL; @@ -444,11 +573,9 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) fprintf(stderr,"[faidx] Same input/output : %s\n", output_file); goto exit2; } - if (!out.isbgzip) { - out.fp = fopen( output_file, "w" ); - } else { + char mode[13] = ""; + if (out.isbgzip) { hts_opt *opts = (hts_opt *)(out.gopt->out.specific); - char mode[13] = "w"; int level = 4; //default compression level while (opts) { if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) { //compression level @@ -460,18 +587,34 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if (level >= 0) { snprintf(mode, sizeof(mode), "w%d", level); //pass compression with mode } - out.bgzf_fp = bgzf_open(output_file, mode); + } else { + 
snprintf(mode, sizeof(mode), "wu"); //uncompressed output } + out.bgzf_fp = bgzf_open(output_file, mode); - if( (!out.isbgzip && out.fp == NULL) || (out.isbgzip && out.bgzf_fp == NULL)) { + if( out.bgzf_fp == NULL) { fprintf(stderr,"[faidx] Cannot open \"%s\" for writing :%s.\n", output_file, strerror(errno) ); goto exit2; } - if (out.isbgzip && pool) { //use thread pool if set + + if (ga.write_index) { + out.idxdata->format = format; + if(out.isbgzip && bgzf_index_build_init(out.bgzf_fp)) { + fprintf(stderr, "[faidx] Failed to setup indexing.\n"); + goto exit1; + } + } + + if (pool) { //use thread pool if set if (bgzf_thread_pool(out.bgzf_fp, pool, 0)) { fprintf(stderr, "Failed to set thread pool for writing\n"); } } + } else { + if (!(out.bgzf_fp = bgzf_open("-", "wu"))) { + fprintf(stderr,"[faidx] Cannot open output for writing :%s.\n", strerror(errno) ); + goto exit2; + } } if (region_file) { @@ -497,21 +640,23 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) exit_status = write_output(fai, &out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format); } - flushed = out.isbgzip ? 
bgzf_flush(out.bgzf_fp) : fflush(out.fp); - if (flushed == EOF) { + if (bgzf_flush(out.bgzf_fp) == EOF) { print_error_errno("faidx", "Failed to flush output\n"); exit_status = EXIT_FAILURE; } exit1: - if( output_file != NULL && !out.isbgzip) { - fclose(out.fp); //no need to check result as already flushed - } else if( output_file != NULL && out.isbgzip) { + + if(ga.write_index && output_file) { + if (writeindex(&out, output_file)) { + print_error_errno("faidx", "Failed to create index\n"); + exit_status = EXIT_FAILURE; + } + } if (bgzf_close(out.bgzf_fp) < 0) { print_error_errno("faidx", "Failed to close output\n"); exit_status = EXIT_FAILURE; } - } exit2: if (strand_names) { @@ -523,6 +668,13 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if (pool) { hts_tpool_destroy(pool); } + if (out.idxdata) { + int i; + for (i = 0; i < out.idxdata->n; ++i) { + free(out.idxdata->indx[i].name); + } + free(out.idxdata->indx); + } sam_global_args_free(&ga); ks_free(&out.buffer); diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c index f40209b4..80aecc62 100644 --- a/samtools/faidx.c.pysam.c +++ b/samtools/faidx.c.pysam.c @@ -58,12 +58,29 @@ DEALINGS IN THE SOFTWARE. 
# define ABS(x) ((x)>=0?(x):-(x)) #endif +/// holds the indexing info for each read name and offsets +typedef struct idx_entry { + char *name; //name + uint64_t seq_offset; //offset to sequence for given read/reference + uint64_t seq_length; //length of sequence + uint64_t qual_offset; //offset to qualify val for given read + uint64_t line_length; //line length with output is made +} idx_entry; + +/// index information about output +typedef struct idx { + size_t n, m; //no of used and max items in index + enum fai_format_options format; //fasta or fastq + idx_entry *indx; //array of index info per sequence + uint64_t offset; //accumulated offset +} idx; + //new params required for output creation typedef struct output { int isbgzip; //is bgzip or uncompressed file - FILE *fp; //uncompressed file pointer BGZF *bgzf_fp; //bgzf file pointer sam_global_args *gopt; //options + idx *idxdata; //index information kstring_t buffer; } output; @@ -112,20 +129,99 @@ static void reverse(char *str, const hts_pos_t len) { } } -/// wrappedwrite - wraps the fwrite and bgzf_write -/** @param out - pointer to data required to write output -* @param buffer - data to write -* @param length - data length -* returns error or length written on success +/// allocidx - allocates required index data buffers +/** @param in - pointer to idx structure + returns NULL on failure + returns index data buffer on success */ -static inline size_t wrappedwrite(output *out, const char *buffer, size_t length) +static inline idx_entry* allocidx(idx* in) { - return out->isbgzip ? bgzf_write(out->bgzf_fp, buffer, length) : - fwrite(buffer, 1, length, out->fp); + if (in && in->n >= in->m) { + size_t newlen = in->m < 1 ? 
16 : in->m << 1; //double on reallocation + idx_entry *tmp = realloc(in->indx, newlen * sizeof(*tmp)); + if (!tmp) { + return NULL; + } + size_t count = newlen - in->n; + memset(tmp + in->n, 0, count * sizeof(*tmp)); + in->indx = tmp; + in->m = newlen; + } + + return &in->indx[in->n++]; +} + +/// writeindex - writes index data +/** @param out - pointer to output structure + * @param output_file - pointer to output file name + returns non zero on failure + returns 0 on success + seq name and offsets are written on fai index, for both compressed and + uncompressed outputs. gzi index, dumped through bgzf api, gives the index + of plain offsets in compressed file +*/ +int writeindex(output *out, char *output_file) +{ + idx *idxdata = out->idxdata; + kstring_t fainame = KS_INITIALIZE, buffer = KS_INITIALIZE; + int ret = 0; + FILE *fp = NULL; + size_t i = 0; + + ksprintf(&fainame, "%s.fai", output_file); + + if (!(fp = fopen(fainame.s, "w"))) { + fprintf(samtools_stderr, "[faidx] Failed to create index file for output.\n"); + ret = 1; + goto end; + } + + // Write fai index data / index on plain - uncompressed data. + // Note on Windows htslib's hfile_oflags() and hopen_fd_stdinout() + // functions guarantee we'll set O_BINARY so the line length is always + // sequence length +1 regardless of the system native line ending. 
+ for (i = 0; i < idxdata->n; ++i) { + idx_entry *e = &idxdata->indx[i]; + ks_clear(&buffer); + if (idxdata->format == FAI_FASTA) { + //name, seq leng, seq offset, seq per line, char per line + ksprintf(&buffer, "%s\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%" + PRIu64"\n", + e->name, e->seq_length, e->seq_offset, e->line_length, + e->line_length + 1); + } else { //FAI_FASTQ + //name, seq leng, seq offset, seq/line, char/line, qual offset + ksprintf(&buffer, "%s\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%" + PRIu64"\t%"PRIu64"\n", + e->name, e->seq_length, e->seq_offset, e->line_length, + e->line_length + 1, e->qual_offset); + } + if (buffer.l != fwrite(buffer.s, 1, buffer.l, fp)) { + fprintf(samtools_stderr, "[faidx] Failed to create fai index file for " + "output.\n"); + ret = 1; + goto end; + } + } + //write gzi index data, index on compressed file + if (out->isbgzip && bgzf_index_dump(out->bgzf_fp, output_file, ".gzi")) { + fprintf(samtools_stderr, "[faidx] Failed to create index gzi file for " + "output.\n"); + ret = 1; + } +end: + if (fp) { + fclose(fp); + } + ks_free(&buffer); + ks_free(&fainame); + + return ret; } -static int write_line(faidx_t *faid, output *out, const char *line, const char *name, - const int ignore, const hts_pos_t length, const hts_pos_t seq_len) { +static int write_line(faidx_t *faid, output *out, const char *line, + const char *name, const int ignore, + const hts_pos_t length, const hts_pos_t seq_len) { int id; hts_pos_t beg, end; @@ -133,9 +229,9 @@ static int write_line(faidx_t *faid, output *out, const char *line, const char * fprintf(samtools_stderr, "[faidx] Failed to fetch sequence in %s\n", name); if (ignore && seq_len == -2) { - return EXIT_SUCCESS; + return 0; } else { - return EXIT_FAILURE; + return -1; } } else if (seq_len == 0) { fprintf(samtools_stderr, "[faidx] Zero length sequence: %s\n", name); @@ -149,24 +245,26 @@ static int write_line(faidx_t *faid, output *out, const char *line, const char * for (i = 0; i < seq_sz; i += length) { 
hts_pos_t len = i + length < seq_sz ? length : seq_sz - i; - if (wrappedwrite(out, line + i, len) < len || - wrappedwrite(out, "\n", 1) < 1) { + if (bgzf_write(out->bgzf_fp, line + i, len) < len || + bgzf_write(out->bgzf_fp, "\n", 1) < 1) { print_error_errno("faidx", "failed to write output"); - return EXIT_FAILURE; + return -1; } } - return EXIT_SUCCESS; + return 0; } -static int write_output(faidx_t *faid, output *out, const char *name, const int ignore, - const hts_pos_t length, const int rev, - const char *pos_strand_name, const char *neg_strand_name, +static int write_output(faidx_t *faid, output *out, const char *name, + const int ignore, const hts_pos_t length, const int rev, + const char *pos_strand_name, + const char *neg_strand_name, enum fai_format_options format) { hts_pos_t seq_len, wrap_len = length, len = 0; char *seq = NULL, *qual = NULL; int ret = EXIT_FAILURE; + char *idx_name = NULL; if (wrap_len < 0) wrap_len = fai_line_length(faid, name); @@ -177,44 +275,74 @@ static int write_output(faidx_t *faid, output *out, const char *name, const int if (rev && seq_len > 0) { reverse_complement(seq, seq_len); } + //write the name - len = ksprintf(&out->buffer, "%c%s%s\n", format == FAI_FASTA ? '>' : '@', name, rev ? neg_strand_name : pos_strand_name); - if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) { - fprintf(samtools_stderr,"[faidx] Failed to write buffer\n"); - goto exit; - } ks_clear(&out->buffer); - //write bases - if ((ret = write_line(faid, out, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) { - goto exit; + len = ksprintf(&out->buffer, "%c%s%s\n", + format == FAI_FASTA ? '>' : '@', name, + rev ? 
neg_strand_name : pos_strand_name); + if (out->gopt->write_index) { + if (!(idx_name = strdup(out->buffer.s+1))) { + fprintf(samtools_stderr,"[faidx] Failed to allocate memory.\n"); + goto end; + } + idx_name[out->buffer.l-2] = 0; // remove \n + } + if (bgzf_write(out->bgzf_fp, out->buffer.s, out->buffer.l) < len) { + fprintf(samtools_stderr,"[faidx] Failed to write buffer.\n"); + goto end; } + //write bases + if (write_line(faid, out, seq, name, ignore, wrap_len, seq_len) < 0) + goto end; + + uint64_t seq_sz; + seq_sz = seq_len + seq_len / wrap_len + ((seq_len % wrap_len) ? 1 : 0); + if (format == FAI_FASTQ) { //write quality qual = fai_fetchqual64(faid, name, &seq_len); - if (rev && seq_len > 0) { + if (rev && seq_len > 0) reverse(qual, seq_len); - } - len = ksprintf(&out->buffer, "+\n"); - if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) { + if (bgzf_write(out->bgzf_fp, "+\n", 2) != 2) { fprintf(samtools_stderr,"[faidx] Failed to write buffer\n"); - goto exit; + goto end; } - ks_clear(&out->buffer); - if ((ret = write_line(faid, out, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) { - goto exit; + + if (write_line(faid, out, qual, name, ignore, wrap_len, seq_len) < 0) + goto end; + } + + if (out->gopt->write_index) { + // On-the-fly index construction + idx_entry *e = NULL; + if (out->gopt->write_index && !(e = allocidx(out->idxdata))) { + fprintf(samtools_stderr, "[faidx] Failed to allocate memory.\n"); + goto end; + } + + e->name = idx_name; + e->seq_offset = out->idxdata->offset + len; + e->seq_length = seq_len; + e->line_length = seq_len < wrap_len ? 
seq_len : wrap_len; + idx_name = NULL; + if (out->idxdata->format == FAI_FASTA) { + out->idxdata->offset = e->seq_offset + seq_sz; + } else { // FASTQ + e->qual_offset = e->seq_offset + seq_sz + 2; // "+\n" + out->idxdata->offset = e->qual_offset + seq_sz; } } + ret = EXIT_SUCCESS; -exit: - if (seq) { - free(seq); - } - if (qual) { - free(qual); - } +end: + free(seq); + free(qual); + free(idx_name); + return ret; } @@ -274,7 +402,7 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) } fprintf(fp, " -h, --help This message.\n"); - sam_global_opt_help(fp, "---.-@--"); + sam_global_opt_help(fp, "---.-@.-"); return exit_status; } @@ -291,8 +419,9 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) char *fai_name = NULL; // specified index name char *gzi_name = NULL; // specified compressed index name sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - int exit_status = EXIT_FAILURE, flushed = 0; - struct output out = { 0, samtools_stdout, NULL, &ga, KS_INITIALIZE}; //data required for output writing + int exit_status = EXIT_FAILURE; + idx idxdata = { 0, 0, FAI_FASTA, NULL}; + struct output out = { 0, NULL, &ga, &idxdata, KS_INITIALIZE}; //data required for output writing faidx_t *fai = NULL; hts_tpool *pool = NULL; @@ -446,11 +575,9 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) fprintf(samtools_stderr,"[faidx] Same input/output : %s\n", output_file); goto exit2; } - if (!out.isbgzip) { - out.fp = fopen( output_file, "w" ); - } else { + char mode[13] = ""; + if (out.isbgzip) { hts_opt *opts = (hts_opt *)(out.gopt->out.specific); - char mode[13] = "w"; int level = 4; //default compression level while (opts) { if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) { //compression level @@ -462,18 +589,34 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if (level >= 0) { snprintf(mode, sizeof(mode), "w%d", level); //pass compression with mode } - out.bgzf_fp = bgzf_open(output_file, mode); 
+ } else { + snprintf(mode, sizeof(mode), "wu"); //uncompressed output } + out.bgzf_fp = bgzf_open(output_file, mode); - if( (!out.isbgzip && out.fp == NULL) || (out.isbgzip && out.bgzf_fp == NULL)) { + if( out.bgzf_fp == NULL) { fprintf(samtools_stderr,"[faidx] Cannot open \"%s\" for writing :%s.\n", output_file, strerror(errno) ); goto exit2; } - if (out.isbgzip && pool) { //use thread pool if set + + if (ga.write_index) { + out.idxdata->format = format; + if(out.isbgzip && bgzf_index_build_init(out.bgzf_fp)) { + fprintf(samtools_stderr, "[faidx] Failed to setup indexing.\n"); + goto exit1; + } + } + + if (pool) { //use thread pool if set if (bgzf_thread_pool(out.bgzf_fp, pool, 0)) { fprintf(samtools_stderr, "Failed to set thread pool for writing\n"); } } + } else { + if (!(out.bgzf_fp = bgzf_open("-", "wu"))) { + fprintf(samtools_stderr,"[faidx] Cannot open output for writing :%s.\n", strerror(errno) ); + goto exit2; + } } if (region_file) { @@ -499,21 +642,23 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) exit_status = write_output(fai, &out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format); } - flushed = out.isbgzip ? 
bgzf_flush(out.bgzf_fp) : fflush(out.fp); - if (flushed == EOF) { + if (bgzf_flush(out.bgzf_fp) == EOF) { print_error_errno("faidx", "Failed to flush output\n"); exit_status = EXIT_FAILURE; } exit1: - if( output_file != NULL && !out.isbgzip) { - fclose(out.fp); //no need to check result as already flushed - } else if( output_file != NULL && out.isbgzip) { + + if(ga.write_index && output_file) { + if (writeindex(&out, output_file)) { + print_error_errno("faidx", "Failed to create index\n"); + exit_status = EXIT_FAILURE; + } + } if (bgzf_close(out.bgzf_fp) < 0) { print_error_errno("faidx", "Failed to close output\n"); exit_status = EXIT_FAILURE; } - } exit2: if (strand_names) { @@ -525,6 +670,13 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if (pool) { hts_tpool_destroy(pool); } + if (out.idxdata) { + int i; + for (i = 0; i < out.idxdata->n; ++i) { + free(out.idxdata->indx[i].name); + } + free(out.idxdata->indx); + } sam_global_args_free(&ga); ks_free(&out.buffer); diff --git a/samtools/phase.c b/samtools/phase.c index 62a278f2..96962631 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -1,7 +1,7 @@ /* phase.c -- phase subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2013-2016, 2019 Genome Research Ltd. + Copyright (C) 2013-2016, 2019, 2024 Genome Research Ltd. Author: Heng Li diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index c239d232..dbd28d0e 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -3,7 +3,7 @@ /* phase.c -- phase subcommand. Copyright (C) 2011 Broad Institute. - Copyright (C) 2013-2016, 2019 Genome Research Ltd. + Copyright (C) 2013-2016, 2019, 2024 Genome Research Ltd. Author: Heng Li diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 6afd653f..e863d4e4 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/faidx.h" #include "htslib/khash.h" #include "htslib/kstring.h" +#include "htslib/hfile.h" #include "htslib/thread_pool.h" #include "htslib/hts_expr.h" #include "samtools.h" @@ -85,8 +86,10 @@ typedef struct samview_settings { sam_hdr_t *header; samFile *in, *out, *un_out; int64_t count; + int64_t processed; int is_count; char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; + char *fn_counts; int fetch_pairs, nreglist; hts_reglist_t *reglist; int sanitize; @@ -671,13 +674,17 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * k = kh_get(names,mate_names,bam_get_qname(rec)); if ( k != kh_end(mate_names) ) drop = 0; } + if ( drop ) + continue; int p = 0; - if (!drop && (p=process_aln(conf->header, rec, conf))== 0) { + conf->processed++; + if ((p=process_aln(conf->header, rec, conf)) == 0) { if (adjust_tags(conf->header, rec, conf) != 0) goto out; if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, &write_error) < 0) goto out; + conf->count++; } if (p < 0) goto out; @@ -707,6 +714,7 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * // Common code for processing and writing a record static inline int process_one_record(samview_settings_t *conf, bam1_t *b, int *write_error) { + conf->processed++; if (conf->sanitize) if (bam_sanitize(conf->header, b, conf->sanitize) < 0) return -1; @@ -815,6 +823,48 @@ static void aux_list_free(samview_settings_t *settings) { kh_destroy(aux_exists, settings->remove_tag); } +static int write_counts_to_file(samview_settings_t *settings) { + kstring_t text = KS_INITIALIZE; + hFILE *outfile = NULL; + int ret = -1; + int r = ksprintf(&text, + "{\n" + " \"records_processed\" : %"PRId64",\n" + " \"records_filter_accepted\" : %"PRId64",\n" + " \"records_filter_rejected\" : %"PRId64"\n" + "}\n", + settings->processed, settings->count, + settings->processed - settings->count); + if (r < 0) { + print_error_errno("view", 
"failed to make read counts text"); + goto out; + } + outfile = hopen(settings->fn_counts, "w"); + if (!outfile) { + print_error_errno("view", "failed to open \"%s\"", settings->fn_counts); + goto out; + } + if (hwrite(outfile, ks_c_str(&text), ks_len(&text)) != ks_len(&text)) { + print_error_errno("view", "failed to write to \"%s\"", + settings->fn_counts); + goto out; + } + r = hclose(outfile); + outfile = NULL; + if (r < 0) { + print_error_errno("view", "error on closing \"%s\"", + settings->fn_counts); + goto out; + } + ret = 0; + + out: + ks_free(&text); + if (outfile) + hclose_abruptly(outfile); + return ret; +} + int main_samview(int argc, char *argv[]) { samview_settings_t settings; @@ -872,6 +922,7 @@ int main_samview(int argc, char *argv[]) {"remove-flags", required_argument, NULL, LONGOPT('r')}, {"remove-tag", required_argument, NULL, 'x'}, {"require-flags", required_argument, NULL, 'f'}, + {"save-counts", required_argument, NULL, LONGOPT('c')}, {"subsample", required_argument, NULL, LONGOPT('s')}, {"subsample-seed", required_argument, NULL, LONGOPT('S')}, {"tag", required_argument, NULL, 'd'}, @@ -935,6 +986,7 @@ int main_samview(int argc, char *argv[]) settings.count_rf |= SAM_SEQ; break; case 'c': settings.is_count = 1; break; + case LONGOPT('c'): settings.fn_counts = optarg; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; @@ -1457,6 +1509,11 @@ int main_samview(int argc, char *argv[]) } } + if (settings.fn_counts && ret == 0) { + if (write_counts_to_file(&settings) < 0) + ret = EXIT_FAILURE; + } + // close files, free and return if (settings.in) check_sam_close("view", settings.in, settings.fn_in, "standard input", &ret); if (settings.out) check_sam_close("view", settings.out, settings.fn_out, "standard output", &ret); @@ -1525,12 +1582,14 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -H, --header-only Print SAM header only (no alignments)\n" " --no-header Print SAM alignment records 
only [default]\n" " -c, --count Print only the count of matching records\n" +" --save-counts FILE Write counts of passed/failed records to FILE\n" " -o, --output FILE Write output to FILE [standard output]\n" " -U, --unoutput FILE, --output-unselected FILE\n" " Output reads not selected by filters to FILE\n" " -p, --unmap Set flag to UNMAP on reads not selected\n" " then write to output file.\n" " -P, --fetch-pairs Retrieve complete pairs even when outside of region\n" +"\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index d1f55ee9..5392e2bb 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/faidx.h" #include "htslib/khash.h" #include "htslib/kstring.h" +#include "htslib/hfile.h" #include "htslib/thread_pool.h" #include "htslib/hts_expr.h" #include "samtools.h" @@ -87,8 +88,10 @@ typedef struct samview_settings { sam_hdr_t *header; samFile *in, *out, *un_out; int64_t count; + int64_t processed; int is_count; char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; + char *fn_counts; int fetch_pairs, nreglist; hts_reglist_t *reglist; int sanitize; @@ -673,13 +676,17 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * k = kh_get(names,mate_names,bam_get_qname(rec)); if ( k != kh_end(mate_names) ) drop = 0; } + if ( drop ) + continue; int p = 0; - if (!drop && (p=process_aln(conf->header, rec, conf))== 0) { + conf->processed++; + if ((p=process_aln(conf->header, rec, conf)) == 0) { if (adjust_tags(conf->header, rec, conf) != 0) goto out; if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, &write_error) < 0) goto out; + conf->count++; } if (p < 0) goto out; @@ -709,6 +716,7 @@ static int 
fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * // Common code for processing and writing a record static inline int process_one_record(samview_settings_t *conf, bam1_t *b, int *write_error) { + conf->processed++; if (conf->sanitize) if (bam_sanitize(conf->header, b, conf->sanitize) < 0) return -1; @@ -817,6 +825,48 @@ static void aux_list_free(samview_settings_t *settings) { kh_destroy(aux_exists, settings->remove_tag); } +static int write_counts_to_file(samview_settings_t *settings) { + kstring_t text = KS_INITIALIZE; + hFILE *outfile = NULL; + int ret = -1; + int r = ksprintf(&text, + "{\n" + " \"records_processed\" : %"PRId64",\n" + " \"records_filter_accepted\" : %"PRId64",\n" + " \"records_filter_rejected\" : %"PRId64"\n" + "}\n", + settings->processed, settings->count, + settings->processed - settings->count); + if (r < 0) { + print_error_errno("view", "failed to make read counts text"); + goto out; + } + outfile = hopen(settings->fn_counts, "w"); + if (!outfile) { + print_error_errno("view", "failed to open \"%s\"", settings->fn_counts); + goto out; + } + if (hwrite(outfile, ks_c_str(&text), ks_len(&text)) != ks_len(&text)) { + print_error_errno("view", "failed to write to \"%s\"", + settings->fn_counts); + goto out; + } + r = hclose(outfile); + outfile = NULL; + if (r < 0) { + print_error_errno("view", "error on closing \"%s\"", + settings->fn_counts); + goto out; + } + ret = 0; + + out: + ks_free(&text); + if (outfile) + hclose_abruptly(outfile); + return ret; +} + int main_samview(int argc, char *argv[]) { samview_settings_t settings; @@ -874,6 +924,7 @@ int main_samview(int argc, char *argv[]) {"remove-flags", required_argument, NULL, LONGOPT('r')}, {"remove-tag", required_argument, NULL, 'x'}, {"require-flags", required_argument, NULL, 'f'}, + {"save-counts", required_argument, NULL, LONGOPT('c')}, {"subsample", required_argument, NULL, LONGOPT('s')}, {"subsample-seed", required_argument, NULL, LONGOPT('S')}, {"tag", 
required_argument, NULL, 'd'}, @@ -937,6 +988,7 @@ int main_samview(int argc, char *argv[]) settings.count_rf |= SAM_SEQ; break; case 'c': settings.is_count = 1; break; + case LONGOPT('c'): settings.fn_counts = optarg; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; @@ -1459,6 +1511,11 @@ int main_samview(int argc, char *argv[]) } } + if (settings.fn_counts && ret == 0) { + if (write_counts_to_file(&settings) < 0) + ret = EXIT_FAILURE; + } + // close files, free and return if (settings.in) check_sam_close("view", settings.in, settings.fn_in, "standard input", &ret); if (settings.out) check_sam_close("view", settings.out, settings.fn_out, "standard output", &ret); @@ -1527,12 +1584,14 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -H, --header-only Print SAM header only (no alignments)\n" " --no-header Print SAM alignment records only [default]\n" " -c, --count Print only the count of matching records\n" +" --save-counts FILE Write counts of passed/failed records to FILE\n" " -o, --output FILE Write output to FILE [standard output]\n" " -U, --unoutput FILE, --output-unselected FILE\n" " Output reads not selected by filters to FILE\n" " -p, --unmap Set flag to UNMAP on reads not selected\n" " then write to output file.\n" " -P, --fetch-pairs Retrieve complete pairs even when outside of region\n" +"\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" diff --git a/samtools/samtools.h b/samtools/samtools.h index a244c66c..fb8534f4 100644 --- a/samtools/samtools.h +++ b/samtools/samtools.h @@ -32,18 +32,20 @@ DEALINGS IN THE SOFTWARE. 
*/ const char *samtools_version(void); /* BAM sanitizer options */ -#define FIX_POS 2 -#define FIX_MQUAL 4 -#define FIX_UNMAP 8 -#define FIX_CIGAR 16 -#define FIX_AUX 32 +#define FIX_POS 2 +#define FIX_MQUAL 4 +#define FIX_UNMAP 8 +#define FIX_CIGAR 16 +#define FIX_AUX 32 +#define FIX_CIGDUP 64 +#define FIX_CIGARX 128 // default for position sorted data -#define FIX_ON (FIX_MQUAL|FIX_UNMAP|FIX_CIGAR|FIX_AUX) -#define FIX_ALL 255 +#define FIX_ON (FIX_MQUAL|FIX_UNMAP|FIX_CIGAR|FIX_AUX|FIX_CIGDUP) +#define FIX_ALL 127 -// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", and "aux" -// keywords for the bam sanitizer. +// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", "cigdup", +// "cigarx" and "aux" keywords for the bam sanitizer. int bam_sanitize_options(const char *str); // Sanitize a BAM record, using FIX_* bit flags as defined above. diff --git a/samtools/stats.c b/samtools/stats.c index eebfd677..8c802d06 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -1,6 +1,6 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. 
Author: Petr Danecek Author: Sam Nicholls @@ -745,7 +745,7 @@ void update_checksum(bam1_t *bam_line, stats_t *stats) stats->checksum.reads += crc32(0L, seq, (seq_len+1)/2); uint8_t *qual = bam_get_qual(bam_line); - stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); + stats->checksum.quals += crc32(0L, qual, seq_len); } // Collect statistics about the barcode tags specified by init_barcode_tags method @@ -1254,7 +1254,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair if ( is_fwd*is_mfwd>0 ) stats->isize->inc_other(stats->isize->data, isize); - else if ( is_fst*pos_fst>=0 ) + else if ( is_fst*pos_fst>0 ) { if ( is_fst*is_fwd>0 ) stats->isize->inc_inward(stats->isize->data, isize); @@ -1267,6 +1267,9 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->isize->inc_outward(stats->isize->data, isize); else stats->isize->inc_inward(stats->isize->data, isize); + } else { + // assume that exactly overlapping reads are inwards + stats->isize->inc_inward(stats->isize->data, isize); } } } @@ -1537,7 +1540,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); - fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); + fprintf(to, "SN\tis sorted:\t%d\t# %s by coordinate\n", stats->is_sorted ? 1 : 0, stats->is_sorted ? 
"sorted" : "not sorted"); fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); fprintf(to, "SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped)); @@ -1821,47 +1824,50 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); } - fprintf(to, "# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); - if ( stats->cov[0] ) - fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]); - for (icov=1; icovncov-1; icov++) - if ( stats->cov[icov] ) - fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]); - if ( stats->cov[stats->ncov-1] ) - fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]); + if (stats->is_sorted) { + fprintf(to, "# Coverage distribution. 
Use `grep ^COV | cut -f 2-` to extract this part.\n"); + if ( stats->cov[0] ) + fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]); + for (icov=1; icovncov-1; icov++) + if ( stats->cov[icov] ) + fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]); + if ( stats->cov[stats->ncov-1] ) + fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]); - // Calculate average GC content, then sort by GC and depth - fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); - uint32_t igcd; - for (igcd=0; igcdigcd; igcd++) - { - if ( stats->info->fai ) - stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc); - else - if ( stats->gcd[igcd].depth ) - stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); - } - if ( stats->ngcd ) - qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); - igcd = 0; - while ( igcd < stats->igcd ) - { - // Calculate percentiles (10,25,50,75,90th) for the current GC content and print - uint32_t nbins=0, itmp=igcd; - float gc = stats->gcd[igcd].gc; - while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + + // Calculate average GC content, then sort by GC and depth + fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); + uint32_t igcd; + for (igcd=0; igcdigcd; igcd++) + { + if ( stats->info->fai ) + stats->gcd[igcd].gc = rint(100. 
* stats->gcd[igcd].gc); + else + if ( stats->gcd[igcd].depth ) + stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); + } + if ( stats->ngcd ) + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + igcd = 0; + while ( igcd < stats->igcd ) { - nbins++; - itmp++; + // Calculate percentiles (10,25,50,75,90th) for the current GC content and print + uint32_t nbins=0, itmp=igcd; + float gc = stats->gcd[igcd].gc; + while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + { + nbins++; + itmp++; + } + fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), + gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size + ); + igcd += nbins; } - fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), - gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size - ); - igcd += nbins; } } diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 6bd7946b..204e382a 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -2,7 +2,7 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. 
- Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Author: Petr Danecek Author: Sam Nicholls @@ -747,7 +747,7 @@ void update_checksum(bam1_t *bam_line, stats_t *stats) stats->checksum.reads += crc32(0L, seq, (seq_len+1)/2); uint8_t *qual = bam_get_qual(bam_line); - stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); + stats->checksum.quals += crc32(0L, qual, seq_len); } // Collect statistics about the barcode tags specified by init_barcode_tags method @@ -1256,7 +1256,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair if ( is_fwd*is_mfwd>0 ) stats->isize->inc_other(stats->isize->data, isize); - else if ( is_fst*pos_fst>=0 ) + else if ( is_fst*pos_fst>0 ) { if ( is_fst*is_fwd>0 ) stats->isize->inc_inward(stats->isize->data, isize); @@ -1269,6 +1269,9 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->isize->inc_outward(stats->isize->data, isize); else stats->isize->inc_inward(stats->isize->data, isize); + } else { + // assume that exactly overlapping reads are inwards + stats->isize->inc_inward(stats->isize->data, isize); } } } @@ -1539,7 +1542,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); - fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); + fprintf(to, "SN\tis sorted:\t%d\t# %s by coordinate\n", stats->is_sorted ? 1 : 0, stats->is_sorted ? 
"sorted" : "not sorted"); fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); fprintf(to, "SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired_and_mapped+stats->nreads_single_mapped)); @@ -1823,47 +1826,50 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); } - fprintf(to, "# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); - if ( stats->cov[0] ) - fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]); - for (icov=1; icovncov-1; icov++) - if ( stats->cov[icov] ) - fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]); - if ( stats->cov[stats->ncov-1] ) - fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]); + if (stats->is_sorted) { + fprintf(to, "# Coverage distribution. 
Use `grep ^COV | cut -f 2-` to extract this part.\n"); + if ( stats->cov[0] ) + fprintf(to, "COV\t[<%d]\t%d\t%ld\n",stats->info->cov_min,stats->info->cov_min-1, (long)stats->cov[0]); + for (icov=1; icovncov-1; icov++) + if ( stats->cov[icov] ) + fprintf(to, "COV\t[%d-%d]\t%d\t%ld\n",stats->info->cov_min + (icov-1)*stats->info->cov_step, stats->info->cov_min + icov*stats->info->cov_step-1,stats->info->cov_min + icov*stats->info->cov_step-1, (long)stats->cov[icov]); + if ( stats->cov[stats->ncov-1] ) + fprintf(to, "COV\t[%d<]\t%d\t%ld\n",stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1,stats->info->cov_min + (stats->ncov-2)*stats->info->cov_step-1, (long)stats->cov[stats->ncov-1]); - // Calculate average GC content, then sort by GC and depth - fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); - uint32_t igcd; - for (igcd=0; igcdigcd; igcd++) - { - if ( stats->info->fai ) - stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc); - else - if ( stats->gcd[igcd].depth ) - stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); - } - if ( stats->ngcd ) - qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); - igcd = 0; - while ( igcd < stats->igcd ) - { - // Calculate percentiles (10,25,50,75,90th) for the current GC content and print - uint32_t nbins=0, itmp=igcd; - float gc = stats->gcd[igcd].gc; - while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + + // Calculate average GC content, then sort by GC and depth + fprintf(to, "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); + uint32_t igcd; + for (igcd=0; igcdigcd; igcd++) + { + if ( stats->info->fai ) + stats->gcd[igcd].gc = rint(100. 
* stats->gcd[igcd].gc); + else + if ( stats->gcd[igcd].depth ) + stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); + } + if ( stats->ngcd ) + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + igcd = 0; + while ( igcd < stats->igcd ) { - nbins++; - itmp++; + // Calculate percentiles (10,25,50,75,90th) for the current GC content and print + uint32_t nbins=0, itmp=igcd; + float gc = stats->gcd[igcd].gc; + while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + { + nbins++; + itmp++; + } + fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), + gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size + ); + igcd += nbins; } - fprintf(to, "GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), - gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->info->gcd_bin_size, - gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->info->gcd_bin_size - ); - igcd += nbins; } } diff --git a/samtools/version.sh b/samtools/version.sh index 618a098a..20a41712 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.21 +VERSION=1.22 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.py b/setup.py index 4952bb39..8465afb0 100644 --- a/setup.py +++ b/setup.py @@ -61,8 +61,9 @@ def changedir(path): def run_configure(option): sys.stdout.flush() try: + # Always disable ref-cache as its code is omitted from pysam's htslib/ retcode = subprocess.call( - " ".join(("./configure", option)), + " ".join(("./configure", "--disable-ref-cache", option)), shell=True) if retcode != 0: return False