diff --git a/.travis.yml b/.travis.yml
index 51dd553..befe639 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,11 @@ os:
 
 language: c
 
+stages:
+  - test
+  - name: deploy
+    if: tag IS present
+
 env:
   matrix:
     - CONDA_PY=3.6
@@ -14,53 +19,68 @@ env:
     - TWINE_USERNAME=grepall
     - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
 
-_deploy_common: &deploy_common
-  if: branch = master AND type = push AND fork = false
+_cibw_common: &cibw_common
+  addons: {}
   install:
-    - python3 -m pip install cibuildwheel twine
+    - python3 -m pip install "cibuildwheel>=1.1.0" twine
+  script:
+    - set -e
+    - cibuildwheel --output-dir dist
+    - twine check dist/*
+    - twine upload --skip-existing dist/*
+
+_cibw_linux: &cibw_linux
+  stage: deploy
+  os: linux
+  language: python
+  python: '3.5'
+  services:
+    - docker
+  <<: *cibw_common
 
 matrix:
-#  include:
-#    - stage: deploy
-#      os: linux
-#      language: python
-#      python: '3.5'
-#      services:
-#        - docker
-#      env:
-#        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && pip install -r requirements.txt"
-#        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-#        - CIBW_TEST_COMMAND='python -c "import pysam"'
-#      addons:
-#        apt:
-#          packages:
-#            - gcc
-#            - g++
-#            - libcurl4-openssl-dev  # for libcurl support in sdist
-#            - libssl-dev  # for s3 support in sdist
-#      <<: *deploy_common
-#      script:
-#        - set -e
-#        - cibuildwheel --output-dir dist
-#        - python3 -m pip install Cython
-#        - python3 setup.py build_ext --inplace
-#        - python3 setup.py sdist
-#        - twine check dist/*
-#        # - twine upload --skip-existing dist/*
-#    - stage: deploy
-#      os: osx
-#      language: generic
-#      env:
-#        - CIBW_BEFORE_BUILD="pip install -r requirements.txt"
-#        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-#        - CIBW_TEST_COMMAND='python -c "import pysam"'
-#      addons: {}
-#      <<: *deploy_common
-#      script:
-#        - set -e
-#        - cibuildwheel --output-dir dist
-#        - twine check dist/*
-#        # - twine upload --skip-existing dist/*
+  include:
+    - stage: deploy
+      os: linux
+      language: python
+      python: '3.5'
+      addons:
+        apt:
+          packages:
+            - gcc
+            - g++
+            - libcurl4-openssl-dev  # for libcurl support in sdist
+            - libssl-dev  # for s3 support in sdist
+      install:
+        - python3 -m pip install Cython twine
+      script:
+        - set -e
+        - python3 setup.py build_ext --inplace
+        - python3 setup.py sdist
+        - twine check dist/*
+        - twine upload --skip-existing dist/*
+    - <<: *cibw_linux
+      env:
+        - CIBW_BUILD="*_x86_64"
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+    - <<: *cibw_linux
+      env:
+        - CIBW_BUILD="*_i686"
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+    - stage: deploy
+      os: osx
+      language: generic
+      env:
+        - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+      <<: *cibw_common
 
 addons:
   apt:
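
The deploy matrix above is assembled from YAML anchors and merge keys (&cibw_common, <<: *cibw_linux). A standalone sketch of how such merges expand -- assuming PyYAML, whose safe_load resolves <<: merge keys the way Travis's YAML parser does:

    import yaml  # PyYAML, assumed available

    snippet = """
    _base: &base
      install:
        - python3 -m pip install cibuildwheel twine
    jobs:
      - <<: *base
        env: CIBW_BUILD="*_x86_64"
    """
    # the merged job carries both the anchored keys and its own keys;
    # keys defined on the job itself win over merged ones
    job = yaml.safe_load(snippet)["jobs"][0]
    print(job["install"], job["env"])
    # ['python3 -m pip install cibuildwheel twine'] CIBW_BUILD="*_x86_64"
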
diff --git a/devtools/import.py b/devtools/import.py
index 7430cc5..91aae46 100644
--- a/devtools/import.py
+++ b/devtools/import.py
@@ -1,27 +1,11 @@
 #################################################################
-# Importing samtools and htslib
+# Importing samtools, bcftools, and htslib
 #
-# For htslib, simply copy the whole release tar-ball
-# into the directory "htslib" and recreate the file version.h
+# For each package PKG:
 #
-# rm -rf htslib
-# mv download/htslib htslib
-# git checkout -- htslib/version.h
-# Edit the file htslib/version.h to set the right version number.
-#
-# For samtools, type:
-# rm -rf samtools
-# python import.py samtools download/samtools
-# git checkout -- samtools/version.h
-#
-# Manually, then:
-# modify config.h to set compatibility flags
-#
-# For bcftools, type:
-# rm -rf bcftools
-# python import.py bcftools download/bedtools
-# git checkout -- bcftools/version.h
-# rm -rf bedtools/test bedtools/plugins
+# rm -rf PKG
+# python import.py PKG path/to/download/PKG-X.Y
+# git checkout -- PKG/version.h
 
 import fnmatch
 import os
@@ -34,23 +18,19 @@
 
 EXCLUDE = {
     "samtools": (
+        "test",
         "misc",
         "razip.c", "bgzip.c", "main.c",
         "calDepth.c", "bam2bed.c",
-        "wgsim.c",
         "bam_tview.c", "bam_tview.h",
         "bam_tview_html.c",
         "bam_tview_curses.c",
-        "md5fa.c",
-        "md5sum-lite.c",
-        "maq2sam.c",
         "bamcheck.c",
         "chk_indel.c",
         "vcf-miniview.c",
-        "hfile_irods.c",  # requires irods library
     ),
     "bcftools": (
         "test", "plugins", "peakfit.c",
@@ -60,7 +40,8 @@
         "polysomy.c"),
     "htslib": (
         'htslib/tabix.c', 'htslib/bgzip.c',
-        'htslib/htsfile.c', 'htslib/hfile_irods.c'),
+        'htslib/htsfile.c',
+        "test"),
 }
@@ -71,13 +52,21 @@
 
 
-def locate(pattern, root=os.curdir):
-    '''Locate all files matching supplied filename pattern in and below
-    supplied root directory.
+def locate(pattern, root=os.curdir, exclude=[], exclude_htslib=False):
+    '''Locate all files matching supplied filename pattern (but not listed
+    in exclude) in and below the supplied root directory. Omit any files under
+    directories listed in exclude or (if exclude_htslib=True) matching /htslib-*/.
     '''
     for path, dirs, files in os.walk(os.path.abspath(root)):
         for filename in fnmatch.filter(files, pattern):
-            yield os.path.join(path, filename)
+            if filename not in exclude:
+                yield os.path.join(path, filename)
+
+        for dirname in exclude:
+            if dirname in dirs: dirs.remove(dirname)
+        if exclude_htslib:
+            for dirname in [d for d in dirs if re.match(r"htslib-", d)]:
+                dirs.remove(dirname)
 
 
 def _update_pysam_files(cf, destdir):
@@ -149,16 +138,15 @@ def _update_pysam_files(cf, destdir):
         raise IOError(
             "source directory `%s` does not exist." % srcdir)
 
-    cfiles = locate("*.c", srcdir)
-    hfiles = locate("*.h", srcdir)
+    cfiles = locate("*.c", srcdir, exclude=exclude, exclude_htslib=True)
+    hfiles = locate("*.h", srcdir, exclude=exclude, exclude_htslib=True)
     mfiles = itertools.chain(locate("README", srcdir),
                              locate("LICENSE", srcdir))
-
-    # remove unwanted files and htslib subdirectory.
-    cfiles = [x for x in cfiles if os.path.basename(x) not in exclude
-              and not re.search("htslib-", x)]
-    hfiles = [x for x in hfiles if os.path.basename(x) not in exclude
-              and not re.search("htslib-", x)]
+    if dest == "htslib":
+        # Add build files, including *.ac *.in *.mk *.m4
+        mfiles = itertools.chain(mfiles, locate("Makefile", srcdir),
+                                 locate("configure", srcdir),
+                                 locate("*.[aim][cnk4]", srcdir))
 
     ncopied = 0
 
@@ -203,10 +191,12 @@ def _compareAndCopy(src, srcdir, destdir, exclude):
     sys.stdout.write(
         "installed latest source code from %s: "
         "%i files copied\n" % (srcdir, ncopied))
 
-    # redirect stderr to pysamerr and replace bam.h with a stub.
-    sys.stdout.write("applying stderr redirection\n")
-    _update_pysam_files(cf, destdir)
+    if dest in MAIN:
+        # redirect stderr to pysamerr and replace bam.h with a stub.
+        sys.stdout.write("applying stderr redirection\n")
+
+        _update_pysam_files(cf, destdir)
 
 sys.exit(0)
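
The reworked locate() above folds the old post-hoc list filtering into the directory walk itself. For illustration, a standalone sketch (not the devtools code itself) of the same pruning idiom:

    import fnmatch
    import os
    import re

    def walk_c_files(root, exclude_dirs=("test",)):
        """Yield *.c files under root, skipping excluded directories."""
        for path, dirs, files in os.walk(os.path.abspath(root)):
            # os.walk is top-down by default, so pruning `dirs` in place
            # stops the walk from descending into those directories --
            # the same trick locate() uses for exclude / htslib-* dirs.
            dirs[:] = [d for d in dirs
                       if d not in exclude_dirs and not re.match(r"htslib-", d)]
            for filename in fnmatch.filter(files, "*.c"):
                yield os.path.join(path, filename)
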
diff --git a/devtools/run_tests_travis.sh b/devtools/run_tests_travis.sh
index b20c8a0..9ad41a7 100755
--- a/devtools/run_tests_travis.sh
+++ b/devtools/run_tests_travis.sh
@@ -17,9 +17,9 @@ WORKDIR=`pwd`
 
 #Install miniconda python
 if [ $TRAVIS_OS_NAME == "osx" ]; then
-    wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
+    wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
 else
-    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate  # Default OS versions are old and have SSL / CERT issues
+    wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate  # Default OS versions are old and have SSL / CERT issues
 fi
 bash Miniconda3.sh -b
 
@@ -40,9 +40,10 @@ conda config --add channels conda-forge
 # NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488
 conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses
 
-# Need to make C compiler and linker use the anaconda includes and libraries:
-export PREFIX=~/miniconda3/
-export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
+# As HTSLIB_MODE is (defaulted to) 'shared', ensure we don't pick up
+# the external headers from the Conda-installed htslib package.
+mv $CONDA_PREFIX/include/htslib $CONDA_PREFIX/include/htslib-disable
+
 export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
 
 echo "show samtools, htslib, and bcftools versions"
diff --git a/doc/faq.rst b/doc/faq.rst
index e07e1cf..dc0898e 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -40,7 +40,7 @@ parts have not been fully tested.
 A related issue is when different threads read from the same file
 objec - or the same thread uses two iterators over a file. There is
 only a single file-position for each opened file. To prevent this from
-hapeding, use the option ``mulitple_iterator=True`` when calling
+happening, use the option ``multiple_iterators=True`` when calling
 a fetch() method. This will return an iterator on a newly opened
 file.
 
@@ -109,7 +109,7 @@ Here, the output is::
 The reason for this behaviour is that every iterator needs to keep
 track of its current position in the file. Within pysam, each opened
 file can only keep track of one file position and hence there can only
-be one iterator per file. Using the option ``mulitple_iterators=True``
+be one iterator per file. Using the option ``multiple_iterators=True``
 will return an iterator within a newly opened file.
 This iterator will not interfere with existing iterators as it has
 its own file handle associated with it.
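
As a usage note for the corrected faq.rst option name: ``multiple_iterators`` is the parameter accepted by pysam's fetch(). A minimal sketch -- the BAM path is a placeholder for any coordinate-sorted, indexed file:

    import pysam

    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        for outer in bam.fetch("chr1", 100, 200):
            # multiple_iterators=True gives the inner fetch() its own file
            # handle, so it does not disturb the outer iterator's position.
            for inner in bam.fetch("chr1", 100, 200, multiple_iterators=True):
                pass
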
diff --git a/doc/release.rst b/doc/release.rst
index ee1875b..07d099d 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,25 @@
 Release notes
 =============
 
+Release 0.15.4
+==============
+
+Bugfix release. The principal reason for this release is to update the
+Cython version in order to fix ``pip install pysam`` on Python 3.8.
+
+* [#879] Fix the add_meta function in libcbcf.pyx, so that
+  meta-information lines added to the header with this function follow
+  the double-quoting rules of the VCF4.2 and VCF4.3 specifications
+* [#863] Force arg to bytes to support non-ASCII encoding
+* [#875] Bump minimum Cython version
+* [#868] Prevent segfault on Python 2.7 AlignedSegment.compare(other=None)
+* [#867] Fix wheel building on TravisCI
+* [#799] Disambiguate interpretation of bcf_read return code
+* [#841] Fix silent truncation of FASTQ with bad q strings
+* [#846] Prevent segmentation fault on ID, when handling malformed records
+* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars
+
+
 Release 0.15.3
 ==============
 
diff --git a/pysam/Pileup.py b/pysam/Pileup.py
deleted file mode 100644
index 1fe05ec..0000000
--- a/pysam/Pileup.py
+++ /dev/null
@@ -1,282 +0,0 @@
-'''Tools for working with files in the samtools pileup -c format.'''
-import collections
-import pysam
-
-PileupSubstitution = collections.namedtuple("PileupSubstitution",
-                                            " ".join((
-                                                "chromosome",
-                                                "pos",
-                                                "reference_base",
-                                                "genotype",
-                                                "consensus_quality",
-                                                "snp_quality",
-                                                "mapping_quality",
-                                                "coverage",
-                                                "read_bases",
-                                                "base_qualities")))
-
-PileupIndel = collections.namedtuple("PileupIndel",
-                                     " ".join((
-                                         "chromosome",
-                                         "pos",
-                                         "reference_base",
-                                         "genotype",
-                                         "consensus_quality",
-                                         "snp_quality",
-                                         "mapping_quality",
-                                         "coverage",
-                                         "first_allele",
-                                         "second_allele",
-                                         "reads_first",
-                                         "reads_second",
-                                         "reads_diff")))
-
-
-def iterate(infile):
-    '''iterate over ``samtools pileup -c`` formatted file.
-
-    *infile* can be any iterator over a lines.
-
-    The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
-    or :class:`pysam.Pileup.PileupIndel`.
-
-    .. 
note:: - - The parser converts to 0-based coordinates - ''' - - conv_subst = (str, lambda x: int(x) - 1, str, - str, int, int, int, int, str, str) - conv_indel = (str, lambda x: int(x) - 1, str, str, int, - int, int, int, str, str, int, int, int) - - for line in infile: - d = line[:-1].split() - if d[2] == "*": - try: - yield PileupIndel(*[x(y) for x, y in zip(conv_indel, d)]) - except TypeError: - raise pysam.SamtoolsError("parsing error in line: `%s`" % line) - else: - try: - yield PileupSubstitution(*[x(y) for x, y in zip(conv_subst, d)]) - except TypeError: - raise pysam.SamtoolsError("parsing error in line: `%s`" % line) - - -ENCODE_GENOTYPE = { - 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', - 'AA': 'A', 'CC': 'C', 'GG': 'G', 'TT': 'T', 'UU': 'U', - 'AG': 'r', 'GA': 'R', - 'CT': 'y', 'TC': 'Y', - 'AC': 'm', 'CA': 'M', - 'GT': 'k', 'TG': 'K', - 'CG': 's', 'GC': 'S', - 'AT': 'w', 'TA': 'W', -} - -DECODE_GENOTYPE = { - 'A': 'AA', - 'C': 'CC', - 'G': 'GG', - 'T': 'TT', - 'r': 'AG', 'R': 'AG', - 'y': 'CT', 'Y': 'CT', - 'm': 'AC', 'M': 'AC', - 'k': 'GT', 'K': 'GT', - 's': 'CG', 'S': 'CG', - 'w': 'AT', 'W': 'AT', -} - -# ------------------------------------------------------------ - - -def encodeGenotype(code): - '''encode genotypes like GG, GA into a one-letter code. - The returned code is lower case if code[0] < code[1], otherwise - it is uppercase. - ''' - return ENCODE_GENOTYPE[code.upper()] - - -def decodeGenotype(code): - '''decode single letter genotypes like m, M into two letters. - This is the reverse operation to :meth:`encodeGenotype`. - ''' - return DECODE_GENOTYPE[code] - - -def translateIndelGenotypeFromVCF(vcf_genotypes, ref): - '''translate indel from vcf to pileup format.''' - - # indels - def getPrefix(s1, s2): - '''get common prefix of strings s1 and s2.''' - n = min(len(s1), len(s2)) - for x in range(n): - if s1[x] != s2[x]: - return s1[:x] - return s1[:n] - - def getSuffix(s1, s2): - '''get common sufix of strings s1 and s2.''' - n = min(len(s1), len(s2)) - if s1[-1] != s2[-1]: - return "" - for x in range(-2, -n - 1, -1): - if s1[x] != s2[x]: - return s1[x + 1:] - return s1[-n:] - - def getGenotype(variant, ref): - - if variant == ref: - return "*", 0 - - if len(ref) > len(variant): - # is a deletion - if ref.startswith(variant): - return "-%s" % ref[len(variant):], len(variant) - 1 - elif ref.endswith(variant): - return "-%s" % ref[:-len(variant)], -1 - else: - prefix = getPrefix(ref, variant) - suffix = getSuffix(ref, variant) - shared = len(prefix) + len(suffix) - len(variant) - # print "-", prefix, suffix, ref, variant, shared, len(prefix), len(suffix), len(ref) - if shared < 0: - raise ValueError() - return "-%s" % ref[len(prefix):-(len(suffix) - shared)], len(prefix) - 1 - - elif len(ref) < len(variant): - # is an insertion - if variant.startswith(ref): - return "+%s" % variant[len(ref):], len(ref) - 1 - elif variant.endswith(ref): - return "+%s" % variant[:len(ref)], 0 - else: - prefix = getPrefix(ref, variant) - suffix = getSuffix(ref, variant) - shared = len(prefix) + len(suffix) - len(ref) - if shared < 0: - raise ValueError() - - return "+%s" % variant[len(prefix):-(len(suffix) - shared)], len(prefix) - else: - assert 0, "snp?" 
- - # in pileup, the position refers to the base - # after the coordinate, hence subtract 1 - # pos -= 1 - - genotypes, offsets = [], [] - is_error = True - - for variant in vcf_genotypes: - try: - g, offset = getGenotype(variant, ref) - except ValueError: - break - - genotypes.append(g) - if g != "*": - offsets.append(offset) - - else: - is_error = False - - if is_error: - raise ValueError() - - assert len(set(offsets)) == 1, "multiple offsets for indel" - offset = offsets[0] - - genotypes = "/".join(genotypes) - return genotypes, offset - - -def vcf2pileup(vcf, sample): - '''convert vcf record to pileup record.''' - - chromosome = vcf.contig - pos = vcf.pos - reference = vcf.ref - allelles = [reference] + vcf.alt - - data = vcf[sample] - - # get genotype - genotypes = data["GT"] - if len(genotypes) > 1: - raise ValueError("only single genotype per position, %s" % (str(vcf))) - - genotypes = genotypes[0] - - # not a variant - if genotypes[0] == ".": - return None - - genotypes = [allelles[int(x)] for x in genotypes if x != "/"] - - # snp_quality is "genotype quality" - snp_quality = consensus_quality = data.get("GQ", [0])[0] - mapping_quality = vcf.info.get("MQ", [0])[0] - coverage = data.get("DP", 0) - - if len(reference) > 1 or max([len(x) for x in vcf.alt]) > 1: - # indel - genotype, offset = translateIndelGenotypeFromVCF(genotypes, reference) - - return PileupIndel(chromosome, - pos + offset, - "*", - genotype, - consensus_quality, - snp_quality, - mapping_quality, - coverage, - genotype, - "<" * len(genotype), - 0, - 0, - 0) - - else: - genotype = encodeGenotype("".join(genotypes)) - read_bases = "" - base_qualities = "" - - return PileupSubstitution(chromosome, pos, reference, - genotype, consensus_quality, - snp_quality, mapping_quality, - coverage, read_bases, - base_qualities) - - -def iterate_from_vcf(infile, sample): - '''iterate over a vcf-formatted file. - - *infile* can be any iterator over a lines. - - The function yields named tuples of the type - :class:`pysam.Pileup.PileupSubstitution` or - :class:`pysam.Pileup.PileupIndel`. - - Positions without a snp will be skipped. - - This method is wasteful and written to support same legacy code - that expects samtools pileup output. - - Better use the vcf parser directly. - - ''' - vcf = pysam.VCF() - vcf.connect(infile) - - if sample not in vcf.getsamples(): - raise KeyError("sample %s not vcf file") - - for row in vcf.fetch(): - result = vcf2pileup(row, sample) - if result: - yield result diff --git a/pysam/VCF.py.obsolete b/pysam/VCF.py.obsolete deleted file mode 100644 index 78e6220..0000000 --- a/pysam/VCF.py.obsolete +++ /dev/null @@ -1,1087 +0,0 @@ -# -# Code to read, write and edit VCF files -# -# VCF lines are encoded as a dictionary with these keys (note: all lowercase): -# 'chrom': string -# 'pos': integer -# 'id': string -# 'ref': string -# 'alt': list of strings -# 'qual': integer -# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"] -# 'info': dictionary of values (see below) -# 'format': list of keys (strings) -# sample keys: dictionary of values (see below) -# -# The sample keys are accessible through vcf.getsamples() -# -# A dictionary of values contains value keys (defined in ##INFO or ##FORMAT lines) which map -# to a list, containign integers, floats, strings, or characters. Missing values are replaced -# by a particular value, often -1 or . 
-# -# Genotypes are not stored as a string, but as a list of 1 or 3 elements (for haploid and diploid samples), -# the first (and last) the integer representing an allele, and the second the separation character. -# Note that there is just one genotype per sample, but for consistency the single element is stored in a list. -# -# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as (key, value) pairs and are accessible -# through getheader() -# -# The VCF class can be instantiated with a 'regions' variable consisting of tuples (chrom,start,end) encoding -# 0-based half-open segments. Only variants with a position inside the segment will be parsed. A regions -# parser is available under parse_regions. -# -# When instantiated, a reference can be passed to the VCF class. This may be any class that supports a -# fetch(chrom, start, end) method. -# -# -# -# NOTE: the position that is returned to Python is 0-based, NOT 1-based as in the VCF file. -# -# -# -# TODO: -# only v4.0 writing is complete; alleles are not converted to v3.3 format -# - -from collections import namedtuple, defaultdict -from operator import itemgetter -import sys, re, copy, bisect - -import pysam - -gtsRegEx = re.compile("[|/\\\\]") -alleleRegEx = re.compile('^[ACGTN]+$') - -# Utility function. Uses 0-based coordinates -def get_sequence(chrom, start, end, fa): - # obtain sequence from .fa file, without truncation - if end<=start: return "" - if not fa: return "N"*(end-start) - if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper() - sequence = fa.fetch(chrom, start, end).upper() - if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence)) - return sequence - -# Utility function. Parses a region string -def parse_regions( string ): - result = [] - for r in string.split(','): - elts = r.split(':') - chrom, start, end = elts[0], 0, 3000000000 - if len(elts)==1: pass - elif len(elts)==2: - if len(elts[1])>0: - ielts = elts[1].split('-') - if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r) - try: start, end = int(ielts[0])-1, int(ielts[1]) - except: raise ValueError("Don't understand region string '%s'" % r) - else: - raise ValueError("Don't understand region string '%s'" % r) - result.append( (chrom,start,end) ) - return result - - -FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue') - -########################################################################################################### -# -# New class -# -########################################################################################################### - -class VCFRecord(object): - '''vcf record. - - initialized from data and vcf meta - ''' - - data = None - vcf = None - - def __init__(self, data, vcf): - self.data, self.vcf = data, vcf - - if len(data) != len(self.vcf._samples): - self.error(str(data), - self.BAD_NUMBER_OF_COLUMNS, - "expected %s for %s samples (%s), got %s" % \ - (len(self.vcf._samples), - len(self.vcf._samples), - self.vcf._samples, - len(data))) - - property contig: - def __get__( self ): return self.data[0] - - property pos: - def __get__( self ): - return self.data.pos - - property id: - def __get__( self ): return self.data[2] - - property ref: - def __get__(self ): - # note: gerton substitutes reference if it can be fixed. 
- return self.data[3].upper() - - property alt: - def __get__(self): - # convert v3.3 to v4.0 alleles below - alt = self.data[4] - if alt == ".": alt = [] - else: alt = alt.upper().split(',') - return alt - - property qual: - def __get__(self): - qual = self.data[5] - if qual == ".": qual = -1 - else: - try: qual = float(qual) - except: self.error(line,self.QUAL_NOT_NUMERICAL) - - property filter: - def __get__(self): - # postpone checking that filters exist. Encode missing filter or no filtering as empty list - if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] - else: filter = cols[6].split(';') - - return filter - - property info: - def __get__(self): - col = self.data[7] - # dictionary of keys, and list of values - info = {} - if col != ".": - for blurp in col.split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.error(str(self.data),self.ERROR_INFO_STRING) - info[elts[0]] = self.parse_formatdata(elts[0], v, self.vcf._info, line) - return info - - property format: - def __get__(self): - return self.data[8].split(':') - - def __getitem__(self, key): - - # parse sample columns - values = self.data[self.vcf._sample2column[key]].split(':') - alt = self.alt - format = self.format - - if len(values) > len(format): - self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) - - result = {} - for idx in range(len(format)): - expected = self.vcf.get_expected(format[idx], self.vcf._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." - else: value = ",".join(["."]*expected) - - result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, line) - if expected != -1 and len(result[format[idx]]) != expected: - self.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS, - "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]])) - if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]])) - result[format[idx]] = result[format[idx]][:expected] - - return result - - def __str__(self): - return str(self.data) - -class VCF: - - # types - NT_UNKNOWN = 0 - NT_NUMBER = 1 - NT_ALLELES = 2 - NT_NR_ALLELES = 3 - NT_GENOTYPES = 4 - NT_PHASED_GENOTYPES = 5 - - _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier", - 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string", - 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s", - 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)", - 4:"POS_NOT_NUMERICAL:Position column is not numerical", - 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field", - 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF", - 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF", - 8:"POS_NOT_POSITIVE:Position field must be >0", - 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'", - 10:"ERROR_INFO_STRING:Error while parsing info field", - 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)", - 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s", - 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string", - 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header", - 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header", - 
16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)", - 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)", - 18:"BAD_GENOTYPE:Cannot parse genotype (%s)", - 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)", - 20:"MISSING_REF:Reference allele missing", - 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)", - 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets", - 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes", - 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields", - 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs", - 26:"WRONG_REF:Wrong reference %s", - 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data", - 28:"BAD_CHR_TAG:Error calculating chr tag for %s", - 29:"ZERO_LENGTH_ALLELE:Found zero-length allele", - 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base" - } - - # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields - _header = [] - - # version number; 33=v3.3; 40=v4.0 - _version = 40 - - # info, filter and format data - _info = {} - _filter = {} - _format = {} - - # header; and required columns - _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] - _samples = [] - - # control behaviour - _ignored_errors = set([11]) # ERROR_UNKNOWN_KEY - _warn_errors = set([]) - _leftalign = False - - # reference sequence - _reference = None - - # regions to include; None includes everything - _regions = None - - # statefull stuff - _lineno = -1 - _line = None - _lines = None - - def __init__(self, _copy=None, reference=None, regions=None, lines=None, leftalign=False): - # make error identifiers accessible by name - for id in self._errors.keys(): self.__dict__[self._errors[id].split(':')[0]] = id - if _copy != None: - self._leftalign = _copy._leftalign - self._header = _copy._header[:] - self._version = _copy._version - self._info = copy.deepcopy(_copy._info) - self._filter = copy.deepcopy(_copy._filter) - self._format = copy.deepcopy(_copy._format) - self._samples = _copy._samples[:] - self._sample2column = copy.deepcopy(_copy._sample2column) - self._ignored_errors = copy.deepcopy(_copy._ignored_errors) - self._warn_errors = copy.deepcopy(_copy._warn_errors) - self._reference = _copy._reference - self._regions = _copy._regions - if reference: self._reference = reference - if regions: self._regions = regions - if leftalign: self._leftalign = leftalign - self._lines = lines - - def error(self,line,error,opt=None): - if error in self._ignored_errors: return - errorlabel, errorstring = self._errors[error].split(':') - if opt: errorstring = errorstring % opt - errwarn = ["Error","Warning"][error in self._warn_errors] - sys.stderr.write("Line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)) - if error in self._warn_errors: return - raise ValueError(errorstring) - - def parse_format(self,line,format,filter=False): - if self._version >= 40: - if not format.startswith('<'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format = "<"+format - if not format.endswith('>'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format += ">" - format = format[1:-1] - data = {'id':None,'number':None,'type':None,'descr':None} - idx = 0 - while len(format.strip())>0: - elts = format.strip().split(',') - first, 
rest = elts[0], ','.join(elts[1:]) - if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')): - if self._version >= 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS) - if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - first = ["ID=","Number=","Type=","Description="][idx] + first - if first.startswith('ID='): data['id'] = first.split('=')[1] - elif first.startswith('Number='): data['number'] = first.split('=')[1] - elif first.startswith('Type='): data['type'] = first.split('=')[1] - elif first.startswith('Description='): - elts = format.split('"') - if len(elts)<3: - self.error(line,self.FORMAT_MISSING_QUOTES) - elts = first.split('=') + [rest] - data['descr'] = elts[1] - rest = '"'.join(elts[2:]) - if rest.startswith(','): rest = rest[1:] - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - format = rest - idx += 1 - if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings - if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - if not data['descr']: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - data['descr'] = '' - if not data['type'] and not data['number']: - # fine, ##filter format - return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.') - if not data['type'] in ["Integer","Float","Character","String","Flag"]: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - # I would like a missing-value field, but it isn't there - if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value - else: data['missing'] = '.' - if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - try: - n = int(data['number']) - t = self.NT_NUMBER - except ValueError: - n = -1 - if data['number'] == '.': t = self.NT_UNKNOWN - elif data['number'] == '#alleles': t = self.NT_ALLELES - elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES - elif data['number'] == '#genotypes': t = self.NT_GENOTYPES - elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing']) - - - def format_format( self, fmt, filter=False ): - values = [('ID',fmt.id)] - if fmt.number != None and not filter: - if fmt.numbertype == self.NT_UNKNOWN: nmb = "." 
- elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number) - elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles" - elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles" - elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes" - elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes" - else: - raise ValueError("Unknown number type encountered: %s" % fmt.numbertype) - values.append( ('Number',nmb) ) - values.append( ('Type', fmt.type) ) - values.append( ('Description', '"' + fmt.description + '"') ) - if self._version == 33: - format = ",".join(v for k,v in values) - else: - format = "<" + (",".join( "%s=%s" % (k,v) for (k,v) in values )) + ">" - return format - - def get_expected(self, format, formatdict, alt): - fmt = formatdict[format] - if fmt.numbertype == self.NT_UNKNOWN: return -1 - if fmt.numbertype == self.NT_NUMBER: return fmt.number - if fmt.numbertype == self.NT_ALLELES: return len(alt)+1 - if fmt.numbertype == self.NT_NR_ALLELES: return len(alt) - if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2 - if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1) - return 0 - - - def _add_definition(self, formatdict, key, data, line ): - if key in formatdict: return - self.error(line,self.ERROR_UNKNOWN_KEY,key) - if data == None: - formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".") - return - if data == []: data = [""] # unsure what type -- say string - if type(data[0]) == type(0.0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None) - return - if type(data[0]) == type(0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None) - return - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".") - - - # todo: trim trailing missing values - def format_formatdata( self, data, format, key=True, value=True, separator=":" ): - output, sdata = [], [] - if type(data) == type([]): # for FORMAT field, make data with dummy values - d = {} - for k in data: d[k] = [] - data = d - # convert missing values; and silently add definitions if required - for k in data: - self._add_definition( format, k, data[k], "(output)" ) - for idx,v in enumerate(data[k]): - if v == format[k].missingvalue: data[k][idx] = "." - # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string - for k in data: - if k != 'GT': sdata.append( (k,data[k]) ) - sdata.sort() - if 'GT' in data: - sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata - for k,v in sdata: - if v == []: v = None - if key and value: - if v != None: output.append( k+"="+','.join(map(str,v)) ) - else: output.append( k ) - elif key: output.append(k) - elif value: - if v != None: output.append( ','.join(map(str,v)) ) - else: output.append( "." 
) # should not happen - # snip off trailing missing data - while len(output) > 1: - last = output[-1].replace(',','').replace('.','') - if len(last)>0: break - output = output[:-1] - return separator.join(output) - - - def enter_default_format(self): - for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'), - FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1), - FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1), - FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid - FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.')]: - if f.id not in self._format: - self._format[f.id] = f - - def parse_header( self, line ): - assert line.startswith('##') - elts = line[2:].split('=') - key = elts[0].strip() - value = '='.join(elts[1:]).strip() - if key == "fileformat": - if value == "VCFv3.3": - self._version = 33 - elif value == "VCFv4.0": - self._version = 40 - elif value == "VCFv4.1": - self._version = 41 - else: - self.error(line,self.UNKNOWN_FORMAT_STRING) - elif key == "INFO": - f = self.parse_format(line, value) - self._info[ f.id ] = f - elif key == "FILTER": - f = self.parse_format(line, value, filter=True) - self._filter[ f.id ] = f - elif key == "FORMAT": - f = self.parse_format(line, value) - self._format[ f.id ] = f - else: - # keep other keys in the header field - self._header.append( (key,value) ) - - - def write_header( self, stream ): - stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10)) - for key,value in self._header: stream.write("##%s=%s\n" % (key,value)) - for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]: - for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER")))) - - - def parse_heading( self, line ): - assert line.startswith('#') - assert not line.startswith('##') - headings = line[1:].split('\t') - if len(headings)==1 and len(line[1:].split()) >= 9: - self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS) - headings = line[1:].split() - - for i,s in enumerate(self._required): - - if len(headings)<=i or headings[i] != s: - - if len(headings) <= i: - err = "(%sth entry not found)" % (i+1) - else: - err = "(found %s, expected %s)" % (headings[i],s) - - #self.error(line,self.BADLY_FORMATTED_HEADING,err) - - # allow FORMAT column to be absent - if len(headings) == 8: - headings.append("FORMAT") - else: - self.error(line,self.BADLY_FORMATTED_HEADING,err) - - self._samples = headings[9:] - self._sample2column = dict( [(y,x) for x,y in enumerate( self._samples ) ] ) - - def write_heading( self, stream ): - stream.write("#" + "\t".join(self._required + self._samples) + "\n") - - def convertGT(self, GTstring): - if GTstring == ".": return ["."] - try: - gts = gtsRegEx.split(GTstring) - if len(gts) == 1: return [int(gts[0])] - if len(gts) != 2: raise ValueError() - if gts[0] == "." 
and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]] - return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])] - except ValueError: - self.error(self._line,self.BAD_GENOTYPE,GTstring) - return [".","|","."] - - - def convertGTback(self, GTdata): - return ''.join(map(str,GTdata)) - - def parse_formatdata( self, key, value, formatdict, line ): - # To do: check that the right number of values is present - f = formatdict.get(key,None) - if f == None: - self._add_definition(formatdict, key, value, line ) - f = formatdict[key] - if f.type == "Flag": - if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE) - return [] - values = value.split(',') - # deal with trailing data in some early VCF files - if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1: - self.error(line,self.ERROR_TRAILING_DATA,values[-1]) - values[-1] = values[-1].split(';')[0] - if f.type == "Integer": - for idx,v in enumerate(values): - try: - if v == ".": values[idx] = f.missingvalue - else: values[idx] = int(v) - except: - self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,values) - return [0] * len(values) - return values - elif f.type == "String": - self._line = line - if f.id == "GT": values = map( self.convertGT, values ) - return values - elif f.type == "Character": - for v in values: - if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR) - return values - elif f.type == "Float": - for idx,v in enumerate(values): - if v == ".": values[idx] = f.missingvalue - try: return map(float,values) - except: - self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,values) - return [0.0] * len(values) - else: - # can't happen - self.error(line,self.ERROR_INFO_STRING) - - - def inregion(self, chrom, pos): - if not self._regions: return True - for r in self._regions: - if r[0] == chrom and r[1] <= pos < r[2]: return True - return False - - - def parse_data( self, line, lineparse=False ): - cols = line.split('\t') - if len(cols) != len(self._samples)+9: - # gracefully deal with absent FORMAT column - if len(cols) == 8 and len(self._samples)==0: - cols.append("") - else: - self.error(line, - self.BAD_NUMBER_OF_COLUMNS, - "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols))) - - chrom = cols[0] - - # get 0-based position - try: pos = int(cols[1])-1 - except: self.error(line,self.POS_NOT_NUMERICAL) - if pos < 0: self.error(line,self.POS_NOT_POSITIVE) - - # implement filtering - if not self.inregion(chrom,pos): return None - - # end of first-pass parse for sortedVCF - if lineparse: return chrom, pos, line - - id = cols[2] - - ref = cols[3].upper() - if ref == ".": - self.error(line,self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,pos-100) - faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) - ref = faref - - # convert v3.3 to v4.0 alleles below - if cols[4] == ".": alt = [] - else: alt = cols[4].upper().split(',') - - if cols[5] == ".": qual = -1 - else: - try: qual = float(cols[5]) - except: self.error(line,self.QUAL_NOT_NUMERICAL) - - # postpone checking that 
filters exist. Encode missing filter or no filtering as empty list - if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] - else: filter = cols[6].split(';') - - # dictionary of keys, and list of values - info = {} - if cols[7] != ".": - for blurp in cols[7].split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.error(line,self.ERROR_INFO_STRING) - info[elts[0]] = self.parse_formatdata(elts[0], v, self._info, line) - - # Gracefully deal with absent FORMAT column - if cols[8] == "": format = [] - else: format = cols[8].split(':') - - # check: all filters are defined - for f in filter: - if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f) - - # check: format fields are defined - for f in format: - if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(line,self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(line,self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(line,self.V33_BAD_ALLELE) - newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(line,self.V40_BAD_ALLELE,allele) - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - for i in range(1,min(len(ref),min(map(len,alt)))): - if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - break - ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - 
longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min(len(allele) for allele in alt) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - - # parse sample columns - samples = [] - for sample in cols[9:]: - dict = {} - values = sample.split(':') - if len(values) > len(format): - self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) - for idx in range(len(format)): - expected = self.get_expected(format[idx], self._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." - else: value = ",".join(["."]*expected) - dict[format[idx]] = self.parse_formatdata(format[idx], value, self._format, line) - if expected != -1 and len(dict[format[idx]]) != expected: - self.error(line,self.BAD_NUMBER_OF_PARAMETERS, - "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]])) - if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]])) - dict[format[idx]] = dict[format[idx]][:expected] - samples.append( dict ) - - # done - d = {'chrom':chrom, - 'pos':pos, # return 0-based position - 'id':id, - 'ref':ref, - 'alt':alt, - 'qual':qual, - 'filter':filter, - 'info':info, - 'format':format} - for key,value in zip(self._samples,samples): - d[key] = value - - return d - - - def write_data(self, stream, data): - required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples - for k in required: - if k not in data: raise ValueError("Required key %s not found in data" % str(k)) - if data['alt'] == []: alt = "." - else: alt = ",".join(data['alt']) - if data['filter'] == None: filter = "." - elif data['filter'] == []: - if self._version == 33: filter = "0" - else: filter = "PASS" - else: filter = ';'.join(data['filter']) - if data['qual'] == -1: qual = "." 
- else: qual = str(data['qual']) - - output = [data['chrom'], - str(data['pos']+1), # change to 1-based position - data['id'], - data['ref'], - alt, - qual, - filter, - self.format_formatdata( data['info'], self._info, separator=";" ), - self.format_formatdata( data['format'], self._format, value=False ) ] - - for s in self._samples: - output.append( self.format_formatdata( data[s], self._format, key=False ) ) - - stream.write( "\t".join(output) + "\n" ) - - def _parse_header(self, stream): - self._lineno = 0 - for line in stream: - self._lineno += 1 - if line.startswith('##'): - self.parse_header( line.strip() ) - elif line.startswith('#'): - self.parse_heading( line.strip() ) - self.enter_default_format() - else: - break - return line - - def _parse(self, line, stream): - if len(line.strip()) > 0: - d = self.parse_data( line.strip() ) - if d: yield d - for line in stream: - self._lineno += 1 - if self._lines and self._lineno > self._lines: raise StopIteration - d = self.parse_data( line.strip() ) - if d: yield d - - ###################################################################################################### - # - # API follows - # - ###################################################################################################### - - def getsamples(self): - """ List of samples in VCF file """ - return self._samples - - def setsamples(self,samples): - """ List of samples in VCF file """ - self._samples = samples - - def getheader(self): - """ List of header key-value pairs (strings) """ - return self._header - - def setheader(self,header): - """ List of header key-value pairs (strings) """ - self._header = header - - def getinfo(self): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - return self._info - - def setinfo(self,info): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - self._info = info - - def getformat(self): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - return self._format - - def setformat(self,format): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - self._format = format - - def getfilter(self): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - return self._filter - - def setfilter(self,filter): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - self._filter = filter - - def setversion(self, version): - if version not in [33,40,41]: raise ValueError("Can only handle v3.3, v4.0 and v4.1 VCF files") - self._version = version - - def setregions(self, regions): - self._regions = regions - - def setreference(self, ref): - """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """ - self._reference = ref - - def ignoreerror(self, errorstring): - try: self._ignored_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def warnerror(self, errorstring): - try: self._warn_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def parse(self, stream): - """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """ - last_line = self._parse_header(stream) - # now return a generator that does the actual work. 
In this way the pre-processing is done - # before the first piece of data is yielded - return self._parse(last_line, stream) - - def write(self, stream, datagenerator): - """ Writes a VCF file to a stream, using a data generator (or list) """ - self.write_header(stream) - self.write_heading(stream) - for data in datagenerator: self.write_data(stream,data) - - def writeheader(self, stream): - """ Writes a VCF header """ - self.write_header(stream) - self.write_heading(stream) - - def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2): - """ Utility function: compares two calls for equality """ - # a variant should always be assigned to a unique position, one base before - # the leftmost position of the alignment gap. If this rule is implemented - # correctly, the two positions must be equal for the calls to be identical. - if pos1 != pos2: return False - # from both calls, trim rightmost bases when identical. Do this safely, i.e. - # only when the reference bases are not Ns - while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]: - ref1 = ref1[:-1] - alt1 = alt1[:-1] - while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]: - ref2 = ref2[:-1] - alt2 = alt2[:-1] - # now, the alternative alleles must be identical - return alt1 == alt2 - -########################################################################################################### -########################################################################################################### -## API functions added by Andreas -########################################################################################################### - - def connect( self, filename ): - '''connect to tabix file.''' - self.tabixfile = pysam.Tabixfile( filename ) - self._parse_header(self.tabixfile.header) - - def fetch(self, - reference = None, - start = None, - end = None, - region = None ): - """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """ - - iter = self.tabixfile.fetch( reference, start, end, region, parser = pysam.asVCF() ) - for x in iter: - yield VCFRecord( x, self ) - - def validate( self, record ): - '''validate vcf record. - - returns a validated record. 
- ''' - - chrom, pos = record.chrom, record.pos - - # check reference - ref = record.ref - if ref == ".": - self.error(str(record),self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom, - pos, - pos+len(ref), - self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,self.pos-100) - faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) - ref = faref - - # check: format fields are defined - for f in record.format: - if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f) - - # check: all filters are defined - for f in record.filter: - if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(line,self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(line,self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(line,self.V33_BAD_ALLELE) - newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(line,self.V40_BAD_ALLELE,allele) - - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - for i in range(1,min(len(ref),min(map(len,alt)))): - if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - break - ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and 
self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min(len(allele) for allele in alt) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - - - - diff --git a/pysam/alternatives.py.obsolete b/pysam/alternatives.py.obsolete deleted file mode 100644 index fd76802..0000000 --- a/pysam/alternatives.py.obsolete +++ /dev/null @@ -1,82 +0,0 @@ -# This file contains an alternative implementation -# to call samtools functions. These are direct calls. -# Plus: less overhead -# Minus: more trouble in maintaining - -#in csamtools.pxd - # samtools toolkit functions - ctypedef int (*pysam_samtools_f)(int argc, char *argv[]) - - int bam_taf2baf(int argc, char *argv[]) - int bam_pileup(int argc, char *argv[]) - int bam_merge(int argc, char *argv[]) - int bam_index(int argc, char *argv[]) - int bam_sort(int argc, char *argv[]) - int bam_tview_main(int argc, char *argv[]) - int bam_mating(int argc, char *argv[]) - int bam_rmdup(int argc, char *argv[]) - int bam_rmdupse(int argc, char *argv[]) - int bam_flagstat(int argc, char *argv[]) - int bam_fillmd(int argc, char *argv[]) - int main_samview(int argc, char *argv[]) - int main_import(int argc, char *argv[]) - int faidx_main(int argc, char *argv[]) - int glf3_view_main(int argc, char *argv[]) - - -## Alternative code in csamtools.pyx -cdef class SamtoolsWrapper: - '''generic wrapper around samtools functions''' - cdef pysam_samtools_f f - - def __init__(self): self.f = NULL - - def call(self, *args ): - - if self.f == NULL: raise NotImplementedError("invalid call to base class" ) - - cdef char ** cargs - cdef int i, n, retval - n = len(args) - # allocate one more for first (dummy) argument (contains command) - cargs = calloc( n+1, sizeof( char *) ) - cargs[0] = "method" - for i from 0 <= i < n: - cargs[i+1] = args[i] - for i from 0 <= i < n+1: - print cargs[i] - retval = self.f(n+1, cargs) - free( cargs ) - return retval - -cdef class SamtoolsWrapperImport( SamtoolsWrapper ): - def __init__(self): self.f = main_import -cdef class SamtoolsWrapperPileup( SamtoolsWrapper ): - def __init__(self): self.f = bam_pileup -cdef class SamtoolsWrapperMerge( SamtoolsWrapper ): - def __init__(self): self.f = bam_merge -cdef class SamtoolsWrapperSort( SamtoolsWrapper ): - def __init__(self): self.f = bam_sort -cdef class SamtoolsWrapperIndex( SamtoolsWrapper ): - def __init__(self): self.f = bam_index -cdef class SamtoolsWrapperFaidx( SamtoolsWrapper ): - def __init__(self): self.f = faidx_main -cdef class SamtoolsWrapperFixMate( SamtoolsWrapper ): - def __init__(self): self.f = bam_mating -cdef class SamtoolsWrapperRmDup( SamtoolsWrapper ): - def __init__(self): self.f = bam_rmdup -cdef class SamtoolsWrapperRmDupSe( SamtoolsWrapper ): - def __init__(self): self.f = bam_rmdupse -cdef class SamtoolsWrapperGlf3Viwen( SamtoolsWrapper ): - def __init__(self): self.f = glf3_view_main -cdef class SamtoolsWrapperFlagStat( SamtoolsWrapper ): - def __init__(self): self.f = bam_flagstat -cdef class SamtoolsWrapperFillMd( SamtoolsWrapper ): - def __init__(self): self.f = bam_fillmd -cdef class 
SamtoolsWrapperCalMd( SamtoolsWrapper ): - def __init__(self): self.f = bam_fillmd - -automatic creation of these functions does not work -due to pyrex/cython - -def sort( *args, **kwargs ): return SamtoolsWrapperSort().call(*args, **kwargs) diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index 265d146..f6f707c 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from pysam.libchtslib cimport * cdef extern from "htslib_util.h": diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index e93074a..9c16bed 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -1,6 +1,4 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### +# cython: language_level=3, embedsignature=True, profile=True ############################################################################### # Cython wrapper for SAM/BAM/CRAM files based on htslib ############################################################################### @@ -113,11 +111,11 @@ cdef inline uint8_t toupper(uint8_t ch): cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): - if ch == '=': + if ch == b'=': if bam_is_rev(b): - return ',' + return b',' else: - return '.' + return b'.' else: if bam_is_rev(b): return tolower(ch) @@ -138,8 +136,9 @@ cdef inline bint pileup_base_qual_skip(bam_pileup1_t * p, uint32_t threshold): cdef inline char map_typecode_htslib_to_python(uint8_t s): """map an htslib typecode to the corresponding python typecode - to be used in the struct or array modules.""" + to be used in the struct or array modules. + """ # map type from htslib to python array cdef char * f = strchr(htslib_types, s) @@ -149,7 +148,7 @@ cdef inline char map_typecode_htslib_to_python(uint8_t s): cdef inline uint8_t map_typecode_python_to_htslib(char s): - """determine value type from type code of array""" + """determine value type from type code of array.""" cdef char * f = strchr(parray_types, s) if f == NULL: return 0 @@ -181,7 +180,9 @@ cdef inline void update_bin(bam1_t * src): # optional tag data manipulation cdef convert_binary_tag(uint8_t * tag): """return bytesize, number of values and array of values - in aux_data memory location pointed to by tag.""" + in aux_data memory location pointed to by tag. + + """ cdef uint8_t auxtype cdef uint8_t byte_size cdef int32_t nvalues @@ -208,7 +209,9 @@ cdef convert_binary_tag(uint8_t * tag): cdef inline uint8_t get_tag_typecode(value, value_type=None): - """guess type code for a *value*. If *value_type* is None, the type + """guess type code for a *value*. 
+ + If *value_type* is None, the type code will be inferred based on the Python type of *value* """ @@ -219,29 +222,27 @@ cdef inline uint8_t get_tag_typecode(value, value_type=None): if isinstance(value, int): if value < 0: if value >= INT8_MIN: - typecode = 'c' + typecode = b'c' elif value >= INT16_MIN: - typecode = 's' + typecode = b's' elif value >= INT32_MIN: - typecode = 'i' + typecode = b'i' # unsigned ints else: if value <= UINT8_MAX: - typecode = 'C' + typecode = b'C' elif value <= UINT16_MAX: - typecode = 'S' + typecode = b'S' elif value <= UINT32_MAX: - typecode = 'I' + typecode = b'I' elif isinstance(value, float): - typecode = 'f' + typecode = b'f' elif isinstance(value, str): - typecode = 'Z' + typecode = b'Z' elif isinstance(value, bytes): - typecode = 'Z' - elif isinstance(value, array.array) or \ - isinstance(value, list) or \ - isinstance(value, tuple): - typecode = 'B' + typecode = b'Z' + elif isinstance(value, array.array) or isinstance(value, list) or isinstance(value, tuple): + typecode = b'B' else: if value_type in 'aAsSIcCZidfH': typecode = force_bytes(value_type)[0] @@ -250,7 +251,7 @@ cdef inline uint8_t get_tag_typecode(value, value_type=None): cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): - '''returns the value typecode of a value. + """returns the value typecode of a value. If max is specified, the appropriate type is returned for a range where value is the minimum. @@ -258,15 +259,14 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): Note that this method returns types from the extended BAM alphabet of types that includes tags that are not part of the SAM specification. - ''' - + """ cdef uint8_t typecode t = type(value) if t is float: - typecode = 'f' + typecode = b'f' elif t is int: if max_value is None: max_value = value @@ -275,11 +275,11 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): # signed ints if min_value < 0: if min_value >= INT8_MIN and max_value <= INT8_MAX: - typecode = 'c' + typecode = b'c' elif min_value >= INT16_MIN and max_value <= INT16_MAX: - typecode = 's' + typecode = b's' elif min_value >= INT32_MIN or max_value <= INT32_MAX: - typecode = 'i' + typecode = b'i' else: raise ValueError( "at least one signed integer out of range of " @@ -287,22 +287,22 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): # unsigned ints else: if max_value <= UINT8_MAX: - typecode = 'C' + typecode = b'C' elif max_value <= UINT16_MAX: - typecode = 'S' + typecode = b'S' elif max_value <= UINT32_MAX: - typecode = 'I' + typecode = b'I' else: raise ValueError( "at least one integer out of range of BAM/SAM specification") else: # Note: hex strings (H) are not supported yet if t is not bytes: - value = value.encode('ascii') + value = value.encode('utf-8') if len(value) == 1: - typecode = 'A' + typecode = b'A' else: - typecode = 'Z' + typecode = b'Z' return typecode @@ -329,6 +329,7 @@ cdef inline pack_tags(tags): Returns a format string and the associated list of arguments to be used in a call to struct.pack_into. 
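A quick sketch of how the typecode rules above surface through the public ``set_tag`` API when no explicit ``value_type`` is passed (the two-letter tag names below are arbitrary illustrations, not tags defined by the SAM specification)::

    import pysam

    a = pysam.AlignedSegment()
    a.set_tag("X1", 100)        # fits an unsigned 8-bit int -> stored as 'C'
    a.set_tag("X2", -70000)     # needs a signed 32-bit int  -> stored as 'i'
    a.set_tag("X3", 1.5)        # float                      -> stored as 'f'
    a.set_tag("X4", "hello")    # str/bytes                  -> stored as 'Z'
    a.set_tag("X5", [1, 2, 3])  # list/tuple/array           -> stored as 'B'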
+ """ fmts, args = ["<"], [] @@ -398,16 +399,16 @@ cdef inline pack_tags(tags): if typecode == 0: raise ValueError("could not deduce typecode for value {}".format(value)) - if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H': + if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H': value = force_bytes(value) - if typecode == "a": - typecode = 'A' + if typecode == b'a': + typecode = b'A' - if typecode == 'Z' or typecode == 'H': - datafmt = "2sB%is" % (len(value)+1) + if typecode == b'Z' or typecode == b'H': + datafmt = '2sB%is' % (len(value)+1) else: - datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0] + datafmt = '2sB%s' % DATATYPE2FORMAT[typecode][0] args.extend([pytag[:2], typecode, @@ -424,8 +425,8 @@ cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src): Length ignores hard-clipped bases. Return 0 if there is no CIGAR alignment. - """ + """ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) if cigar_p == NULL: @@ -454,8 +455,8 @@ cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src): Length includes hard-clipped bases. Return 0 if there is no CIGAR alignment. - """ + """ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) if cigar_p == NULL: @@ -531,9 +532,7 @@ cdef inline int32_t getQueryEnd(bam1_t *src) except -1: cdef inline bytes getSequenceInRange(bam1_t *src, uint32_t start, uint32_t end): - """return python string of the sequence in a bam1_t object. - """ - + """return python string of the sequence in a bam1_t object.""" cdef uint8_t * p cdef uint32_t k cdef char * s @@ -548,7 +547,7 @@ cdef inline bytes getSequenceInRange(bam1_t *src, for k from start <= k < end: # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) # note: do not use string literal as it will be a python string - s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf] + s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf] return charptr_to_bytes(seq) @@ -556,8 +555,7 @@ cdef inline bytes getSequenceInRange(bam1_t *src, cdef inline object getQualitiesInRange(bam1_t *src, uint32_t start, uint32_t end): - """return python array of quality values from a bam1_t object""" - + """return python array of quality values from a bam1_t object.""" cdef uint8_t * p cdef uint32_t k @@ -580,7 +578,7 @@ cdef inline object getQualitiesInRange(bam1_t *src, cdef class AlignedSegment cdef AlignedSegment makeAlignedSegment(bam1_t *src, AlignmentHeader header): - '''return an AlignedSegment object constructed from `src`''' + """return an AlignedSegment object constructed from `src`.""" # note that the following does not call __init__ cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment) dest._delegate = bam_dup1(src) @@ -596,10 +594,12 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp, uint32_t min_base_quality, char * reference_sequence, AlignmentHeader header): - '''return a PileupColumn object constructed from pileup in `plp` and - setting additional attributes. + """return a PileupColumn object. + + It is constructed from pileup in `plp` and setting additional + attributes. 
-    '''
+    """
    # note that the following does not call __init__
    cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
    dest.header = header
@@ -618,7 +618,7 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp,
 cdef class PileupRead
 cdef PileupRead makePileupRead(bam_pileup1_t *src, AlignmentHeader header):
-    '''return a PileupRead object construted from a bam_pileup1_t * object.'''
+    """return a PileupRead object constructed from a bam_pileup1_t * object."""
    # note that the following does not call __init__
    cdef PileupRead dest = PileupRead.__new__(PileupRead)
    dest._alignment = makeAlignedSegment(src.b, header)
@@ -664,7 +664,7 @@ cdef inline uint32_t get_md_reference_length(char * md_tag):
        else:
            l += nmatches
            nmatches = 0
-            if md_tag[md_idx] == '^':
+            if md_tag[md_idx] == b'^':
                md_idx += 1
                while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
                    md_idx += 1
@@ -739,7 +739,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
            s_idx += 1
        elif op == BAM_CDEL:
            for i from 0 <= i < l:
-                s[s_idx] = '-'
+                s[s_idx] = b'-'
                s_idx += 1
        elif op == BAM_CREF_SKIP:
            pass
@@ -769,7 +769,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
    cdef int insertions = 0

    while s[s_idx] != 0:
-        if s[s_idx] >= 'a':
+        if s[s_idx] >= b'a':
            insertions += 1
        s_idx += 1
    s_idx = 0
@@ -790,15 +790,15 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
        else:
            # save matches up to this point, skipping insertions
            for x from 0 <= x < nmatches:
-                while s[s_idx] >= 'a':
+                while s[s_idx] >= b'a':
                    s_idx += 1
                s_idx += 1
-            while s[s_idx] >= 'a':
+            while s[s_idx] >= b'a':
                s_idx += 1
            r_idx += nmatches
            nmatches = 0
-            if md_tag[md_idx] == '^':
+            if md_tag[md_idx] == b'^':
                md_idx += 1
                while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
                    # assert s[s_idx] == '-'
@@ -818,10 +818,10 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
        # save matches up to this point, skipping insertions
        for x from 0 <= x < nmatches:
-            while s[s_idx] >= 'a':
+            while s[s_idx] >= b'a':
                s_idx += 1
            s_idx += 1
-        while s[s_idx] >= 'a':
+        while s[s_idx] >= b'a':
            s_idx += 1

    seq = PyBytes_FromStringAndSize(s, s_idx)
@@ -885,7 +885,7 @@ cdef inline bytes build_reference_sequence(bam1_t * src):

 cdef class AlignedSegment:
-    '''Class representing an aligned segment.
+    """Class representing an aligned segment.

    This class stores a handle to the samtools C-structure representing
    an aligned read. Member read access is forwarded to the C-structure
@@ -909,8 +909,8 @@ cdef class AlignedSegment:
        :class:`~pysam.AlignmentHeader` object to map numerical
        identifiers to chromosome names. If not given, an empty
        header is created.
-    '''
+
+    """
    # Now only called when instances are created from Python
    def __init__(self, AlignmentHeader header=None):
        # see bam_init1
@@ -954,6 +954,7 @@ cdef class AlignedSegment:
        Similarly, the tags field is returned in its parsed state.

        To get a valid SAM record, use :meth:`to_string`.
+
        """
        # sam-parsing is done in sam.c/bam_format1_core which
        # requires a valid header.
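As the comment above notes, ``__str__`` is meant for debugging, while :meth:`to_string` yields a valid SAM line once a header is attached. A minimal round-trip sketch (``ex1.bam`` is a placeholder file name)::

    import pysam

    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        for read in bam:
            line = read.to_string()
            copy = pysam.AlignedSegment.fromstring(line, bam.header)
            assert copy.query_name == read.query_name
            break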
@@ -977,9 +978,11 @@ cdef class AlignedSegment: return makeAlignedSegment(self._delegate, self.header) def compare(self, AlignedSegment other): - '''return -1,0,1, if contents in this are binary - <,=,> to *other* - ''' + """return -1,0,1, if contents in this are binary <,=,> to *other*.""" + + # avoid segfault when other equals None + if other is None: + return -1 cdef int retval, x cdef bam1_t *t @@ -992,10 +995,12 @@ cdef class AlignedSegment: # cdef unsigned char * oo, * tt # tt = (&t.core) # oo = (&o.core) - # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x] + # for x from 0 <= x < sizeof( bam1_core_t): + # print x, tt[x], oo[x] # tt = (t.data) # oo = (o.data) - # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) + # for x from 0 <= x < max(t.l_data, o.l_data): + # print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) # Fast-path test for object identity if t == o: @@ -1042,6 +1047,7 @@ cdef class AlignedSegment: The output format is valid SAM format if a header is associated with the AlignedSegment. + """ cdef kstring_t line line.l = line.m = 0 @@ -1117,90 +1123,95 @@ cdef class AlignedSegment: ######################################################## ## Basic attributes in order of appearance in SAM format - property query_name: - """the query template name (None if not present)""" - def __get__(self): + @property + def query_name(self): + """the query template name or None if not present.""" + cdef bam1_t * src = self._delegate + if src.core.l_qname == 0: + return None - cdef bam1_t * src = self._delegate - if src.core.l_qname == 0: - return None + return charptr_to_str(pysam_bam_get_qname(src)) - return charptr_to_str(pysam_bam_get_qname(src)) + @query_name.setter + def query_name(self, qname): - def __set__(self, qname): + if qname is None or len(qname) == 0: + return - if qname is None or len(qname) == 0: - return + # See issue #447 + # (The threshold is 252 chars, but this includes a \0 byte. + if len(qname) > 251: + raise ValueError("query length out of range {} > 251".format( + len(qname))) - # See issue #447 - # (The threshold is 252 chars, but this includes a \0 byte. 
- if len(qname) > 251: - raise ValueError("query length out of range {} > 251".format( - len(qname))) + qname = force_bytes(qname) + cdef bam1_t * src = self._delegate + # the qname is \0 terminated + cdef uint8_t l = len(qname) + 1 - qname = force_bytes(qname) - cdef bam1_t * src = self._delegate - # the qname is \0 terminated - cdef uint8_t l = len(qname) + 1 + cdef char * p = pysam_bam_get_qname(src) + cdef uint8_t l_extranul = 0 - cdef char * p = pysam_bam_get_qname(src) - cdef uint8_t l_extranul = 0 + if l % 4 != 0: + l_extranul = 4 - l % 4 - if l % 4 != 0: - l_extranul = 4 - l % 4 + cdef bam1_t * retval = pysam_bam_update(src, + src.core.l_qname, + l + l_extranul, + p) + if retval == NULL: + raise MemoryError("could not allocate memory") - cdef bam1_t * retval = pysam_bam_update(src, - src.core.l_qname, - l + l_extranul, - p) - if retval == NULL: - raise MemoryError("could not allocate memory") + src.core.l_extranul = l_extranul + src.core.l_qname = l + l_extranul - src.core.l_extranul = l_extranul - src.core.l_qname = l + l_extranul + # re-acquire pointer to location in memory + # as it might have moved + p = pysam_bam_get_qname(src) - # re-acquire pointer to location in memory - # as it might have moved - p = pysam_bam_get_qname(src) + strncpy(p, qname, l) + # x might be > 255 + cdef uint16_t x = 0 - strncpy(p, qname, l) - # x might be > 255 - cdef uint16_t x = 0 + for x from l <= x < l + l_extranul: + p[x] = b'\0' - for x from l <= x < l + l_extranul: - p[x] = '\0' + @property + def flag(self): + """properties flag.""" + return self._delegate.core.flag - property flag: - """properties flag""" - def __get__(self): - return self._delegate.core.flag - def __set__(self, flag): - self._delegate.core.flag = flag + @flag.setter + def flag(self, flag): + self._delegate.core.flag = flag - property reference_name: + @property + def reference_name(self): """:term:`reference` name""" - def __get__(self): - if self._delegate.core.tid == -1: - return None - if self.header: - return self.header.get_reference_name(self._delegate.core.tid) - else: - raise ValueError("reference_name unknown if no header associated with record") - def __set__(self, reference): - cdef int tid - if reference is None or reference == "*": - self._delegate.core.tid = -1 - elif self.header: - tid = self.header.get_tid(reference) - if tid < 0: - raise ValueError("reference {} does not exist in header".format( - reference)) - self._delegate.core.tid = tid - else: - raise ValueError("reference_name can not be set if no header associated with record") + if self._delegate.core.tid == -1: + return None + if self.header: + return self.header.get_reference_name(self._delegate.core.tid) + else: + raise ValueError("reference_name unknown if no header associated with record") + + @reference_name.setter + def reference_name(self, reference): + cdef int tid + if reference is None or reference == "*": + self._delegate.core.tid = -1 + elif self.header: + tid = self.header.get_tid(reference) + if tid < 0: + raise ValueError("reference {} does not exist in header".format( + reference)) + self._delegate.core.tid = tid + else: + raise ValueError("reference_name can not be set if no header associated with record") - property reference_id: - """:term:`reference` ID + @property + def reference_id(self): + """:term:`reference` ID. .. 
note:: @@ -1209,34 +1220,40 @@ cdef class AlignedSegment: reference sequence, use :meth:`get_reference_name()` """ - def __get__(self): - return self._delegate.core.tid - def __set__(self, tid): - if tid != -1 and self.header and not self.header.is_valid_tid(tid): - raise ValueError("reference id {} does not exist in header".format( - tid)) - self._delegate.core.tid = tid + return self._delegate.core.tid + + @reference_id.setter + def reference_id(self, tid): + if tid != -1 and self.header and not self.header.is_valid_tid(tid): + raise ValueError("reference id {} does not exist in header".format( + tid)) + self._delegate.core.tid = tid + + @property + def reference_start(self): + """0-based leftmost reference coordinate.""" + return self._delegate.core.pos + + @reference_start.setter + def reference_start(self, pos): + ## setting the position requires updating the "bin" attribute + cdef bam1_t * src + src = self._delegate + src.core.pos = pos + update_bin(src) + + @property + def mapping_quality(self): + """mapping quality.""" + return pysam_get_qual(self._delegate) + + @mapping_quality.setter + def mapping_quality(self, qual): + pysam_set_qual(self._delegate, qual) - property reference_start: - """0-based leftmost coordinate""" - def __get__(self): - return self._delegate.core.pos - def __set__(self, pos): - ## setting the position requires updating the "bin" attribute - cdef bam1_t * src - src = self._delegate - src.core.pos = pos - update_bin(src) - - property mapping_quality: - """mapping quality""" - def __get__(self): - return pysam_get_qual(self._delegate) - def __set__(self, qual): - pysam_set_qual(self._delegate, qual) - - property cigarstring: - '''the :term:`cigar` alignment as a string. + @property + def cigarstring(self): + """the :term:`cigar` alignment as a string. The cigar string is a string of alternating integers and characters denoting the length and the type of @@ -1251,71 +1268,80 @@ cdef class AlignedSegment: To unset the cigarstring, assign None or the empty string. 
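For example, the string and tuple representations of the cigar stay in sync (numeric operation codes: 0=M, 1=I, 2=D, 4=S, ...)::

    import pysam

    a = pysam.AlignedSegment()
    a.cigarstring = "10M1I5M"
    assert a.cigartuples == [(0, 10), (1, 1), (0, 5)]

    a.cigarstring = None          # unset
    assert a.cigartuples is None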
- ''' - def __get__(self): - c = self.cigartuples - if c is None: - return None - # reverse order - else: - return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c]) - def __set__(self, cigar): - if cigar is None or len(cigar) == 0: - self.cigartuples = [] - else: - parts = CIGAR_REGEX.findall(cigar) - # reverse order - self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts] + """ + c = self.cigartuples + if c is None: + return None + + return "".join("%i%c" % (y, CODE2CIGAR[x]) for x, y in c) + + @cigarstring.setter + def cigarstring(self, cigar): + if not cigar: + self.cigartuples = [] + return + + parts = CIGAR_REGEX.findall(cigar) + self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x, y in parts] # TODO # property cigar: # """the cigar alignment""" - property next_reference_id: + @property + def next_reference_id(self): """the :term:`reference` id of the mate/next read.""" - def __get__(self): - return self._delegate.core.mtid - def __set__(self, mtid): - if mtid != -1 and self.header and not self.header.is_valid_tid(mtid): - raise ValueError("reference id {} does not exist in header".format( - mtid)) - self._delegate.core.mtid = mtid + return self._delegate.core.mtid - property next_reference_name: - """:term:`reference` name of the mate/next read (None if no - AlignmentFile is associated)""" - def __get__(self): - if self._delegate.core.mtid == -1: - return None - if self.header: - return self.header.get_reference_name(self._delegate.core.mtid) - else: - raise ValueError("next_reference_name unknown if no header associated with record") - - def __set__(self, reference): - cdef int mtid - if reference is None or reference == "*": - self._delegate.core.mtid = -1 - elif reference == "=": - self._delegate.core.mtid = self._delegate.core.tid - elif self.header: - mtid = self.header.get_tid(reference) - if mtid < 0: - raise ValueError("reference {} does not exist in header".format( - reference)) - self._delegate.core.mtid = mtid - else: - raise ValueError("next_reference_name can not be set if no header associated with record") + @next_reference_id.setter + def next_reference_id(self, mtid): + if mtid != -1 and self.header and not self.header.is_valid_tid(mtid): + raise ValueError("reference id {} does not exist in header".format( + mtid)) + self._delegate.core.mtid = mtid + + @property + def next_reference_name(self): + """:term:`reference` name of the mate/next read. 
+
+        Returns None if no AlignmentFile is associated.

-    property next_reference_start:
+        """
+        if self._delegate.core.mtid == -1:
+            return None
+        if self.header:
+            return self.header.get_reference_name(self._delegate.core.mtid)
+        else:
+            raise ValueError("next_reference_name unknown if no header associated with record")
+
+    @next_reference_name.setter
+    def next_reference_name(self, reference):
+        cdef int mtid
+        if reference is None or reference == "*":
+            self._delegate.core.mtid = -1
+        elif reference == "=":
+            self._delegate.core.mtid = self._delegate.core.tid
+        elif self.header:
+            mtid = self.header.get_tid(reference)
+            if mtid < 0:
+                raise ValueError("reference {} does not exist in header".format(
+                    reference))
+            self._delegate.core.mtid = mtid
+        else:
+            raise ValueError("next_reference_name can not be set if no header associated with record")
+
+    @property
+    def next_reference_start(self):
        """the position of the mate/next read."""
-        def __get__(self):
-            return self._delegate.core.mpos
-        def __set__(self, mpos):
-            self._delegate.core.mpos = mpos
+        return self._delegate.core.mpos

-    property query_length:
+    @next_reference_start.setter
+    def next_reference_start(self, mpos):
+        self._delegate.core.mpos = mpos
+
+    @property
+    def query_length(self):
        """the length of the query/read.

        This value corresponds to the length of the sequence supplied
@@ -1333,19 +1359,22 @@ cdef class AlignedSegment:

        Returns 0 if not available.

        """
-        def __get__(self):
-            return self._delegate.core.l_qseq
+        return self._delegate.core.l_qseq
+
+    @property
+    def template_length(self):
+        """the observed query template length."""
+        return self._delegate.core.isize

-    property template_length:
-        """the observed query template length"""
-        def __get__(self):
-            return self._delegate.core.isize
-        def __set__(self, isize):
-            self._delegate.core.isize = isize
+    @template_length.setter
+    def template_length(self, isize):
+        self._delegate.core.isize = isize

-    property query_sequence:
-        """read sequence bases, including :term:`soft clipped` bases
-        (None if not present).
+    @property
+    def query_sequence(self):
+        """read sequence bases, including :term:`soft clipped` bases.
+
+        None is returned if not present.

        Note that assigning to seq will invalidate any quality scores.
        Thus, to in-place edit the sequence and quality scores, copies of
- cdef bam1_t * src - cdef uint8_t * p - cdef char * s - cdef int l, k - cdef Py_ssize_t nbytes_new, nbytes_old - - if seq == None: - l = 0 - else: - l = len(seq) - seq = force_bytes(seq) + @query_sequence.setter + def query_sequence(self, seq): + # samtools manages sequence and quality length memory together + # if no quality information is present, the first byte says 0xff. + cdef bam1_t * src + cdef bam1_t * retval + cdef uint8_t * p + cdef char * s + cdef int l, k + cdef Py_ssize_t nbytes_new, nbytes_old + + if seq == None: + l = 0 + else: + l = len(seq) + seq = force_bytes(seq) + + src = self._delegate + + # as the sequence is stored in half-bytes, the total length (sequence + # plus quality scores) is (l+1)//2 + l + nbytes_new = (l + 1) // 2 + l + nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq - src = self._delegate + # acquire pointer to location in memory + p = pysam_bam_get_seq(src) + src.core.l_qseq = l - # as the sequence is stored in half-bytes, the total length (sequence - # plus quality scores) is (l+1)/2 + l - nbytes_new = (l + 1) / 2 + l - nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq + # change length of data field + retval = pysam_bam_update(src, nbytes_old, nbytes_new, p) - # acquire pointer to location in memory + if retval == NULL: + raise MemoryError("could not allocate memory") + + if l > 0: + # re-acquire pointer to location in memory + # as it might have moved p = pysam_bam_get_seq(src) - src.core.l_qseq = l - - # change length of data field - cdef bam1_t * retval = pysam_bam_update(src, - nbytes_old, - nbytes_new, - p) - - if retval == NULL: - raise MemoryError("could not allocate memory") - - if l > 0: - # re-acquire pointer to location in memory - # as it might have moved - p = pysam_bam_get_seq(src) - for k from 0 <= k < nbytes_new: - p[k] = 0 - # convert to C string - s = seq - for k from 0 <= k < l: - p[k/2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2) - - # erase qualities - p = pysam_bam_get_qual(src) - p[0] = 0xff + for k from 0 <= k < nbytes_new: + p[k] = 0 + # convert to C string + s = seq + for k from 0 <= k < l: + p[k//2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2) + + # erase qualities + p = pysam_bam_get_qual(src) + p[0] = 0xff - self.cache_query_sequence = force_str(seq) + self.cache_query_sequence = force_str(seq) - # clear cached values for quality values - self.cache_query_qualities = None - self.cache_query_alignment_qualities = None + # clear cached values for quality values + self.cache_query_qualities = None + self.cache_query_alignment_qualities = None - property query_qualities: + @property + def query_qualities(self): """read sequence base qualities, including :term:`soft clipped` bases (None if not present). @@ -1447,64 +1476,64 @@ cdef class AlignedSegment: quality scores and the sequence are not the same. 
""" - def __get__(self): + if self.cache_query_qualities: + return self.cache_query_qualities - if self.cache_query_qualities: - return self.cache_query_qualities + cdef bam1_t * src + cdef char * q - cdef bam1_t * src - cdef char * q + src = self._delegate - src = self._delegate + if src.core.l_qseq == 0: + return None - if src.core.l_qseq == 0: - return None + self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq) + return self.cache_query_qualities - self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq) - return self.cache_query_qualities + @query_qualities.setter + def query_qualities(self, qual): + # note that memory is already allocated via setting the sequence + # hence length match of sequence and quality needs is checked. + cdef bam1_t * src + cdef uint8_t * p + cdef int l - def __set__(self, qual): + src = self._delegate + p = pysam_bam_get_qual(src) + if qual is None or len(qual) == 0: + # if absent and there is a sequence: set to 0xff + if src.core.l_qseq != 0: + p[0] = 0xff + return - # note that memory is already allocated via setting the sequence - # hence length match of sequence and quality needs is checked. - cdef bam1_t * src - cdef uint8_t * p - cdef int l + # check for length match + l = len(qual) + if src.core.l_qseq != l: + raise ValueError( + "quality and sequence mismatch: %i != %i" % + (l, src.core.l_qseq)) - src = self._delegate - p = pysam_bam_get_qual(src) - if qual is None or len(qual) == 0: - # if absent and there is a sequence: set to 0xff - if src.core.l_qseq != 0: - p[0] = 0xff - return - - # check for length match - l = len(qual) - if src.core.l_qseq != l: - raise ValueError( - "quality and sequence mismatch: %i != %i" % - (l, src.core.l_qseq)) + # create a python array object filling it + # with the quality scores - # create a python array object filling it - # with the quality scores + # NB: should avoid this copying if qual is + # already of the correct type. + cdef c_array.array result = c_array.array('B', qual) - # NB: should avoid this copying if qual is - # already of the correct type. - cdef c_array.array result = c_array.array('B', qual) + # copy data + memcpy(p, result.data.as_voidptr, l) - # copy data - memcpy(p, result.data.as_voidptr, l) + # save in cache + self.cache_query_qualities = qual - # save in cache - self.cache_query_qualities = qual + @property + def bin(self): + """genomic bin associated with this alignment.""" + return self._delegate.core.bin - property bin: - """properties bin""" - def __get__(self): - return self._delegate.core.bin - def __set__(self, bin): - self._delegate.core.bin = bin + @bin.setter + def bin(self, value): + self._delegate.core.bin = value ########################################################## @@ -1513,113 +1542,149 @@ cdef class AlignedSegment: ########################################################## # 1. 
Flags ########################################################## - property is_paired: - """true if read is paired in sequencing""" - def __get__(self): - return (self.flag & BAM_FPAIRED) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FPAIRED) - - property is_proper_pair: - """true if read is mapped in a proper pair""" - def __get__(self): - return (self.flag & BAM_FPROPER_PAIR) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) - property is_unmapped: - """true if read itself is unmapped""" - def __get__(self): - return (self.flag & BAM_FUNMAP) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FUNMAP) - # setting the unmapped flag requires recalculation of - # bin as alignment length is now implicitely 1 - update_bin(self._delegate) - - property mate_is_unmapped: - """true if the mate is unmapped""" - def __get__(self): - return (self.flag & BAM_FMUNMAP) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FMUNMAP) - property is_reverse: - """true if read is mapped to reverse strand""" - def __get__(self): - return (self.flag & BAM_FREVERSE) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FREVERSE) - property mate_is_reverse: - """true is read is mapped to reverse strand""" - def __get__(self): - return (self.flag & BAM_FMREVERSE) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FMREVERSE) - property is_read1: - """true if this is read1""" - def __get__(self): - return (self.flag & BAM_FREAD1) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FREAD1) - property is_read2: - """true if this is read2""" - def __get__(self): - return (self.flag & BAM_FREAD2) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FREAD2) - property is_secondary: - """true if not primary alignment""" - def __get__(self): - return (self.flag & BAM_FSECONDARY) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FSECONDARY) - property is_qcfail: - """true if QC failure""" - def __get__(self): - return (self.flag & BAM_FQCFAIL) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FQCFAIL) - property is_duplicate: - """true if optical or PCR duplicate""" - def __get__(self): - return (self.flag & BAM_FDUP) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FDUP) - property is_supplementary: - """true if this is a supplementary alignment""" - def __get__(self): - return (self.flag & BAM_FSUPPLEMENTARY) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY) + @property + def is_paired(self): + """True if read is paired in sequencing.""" + return (self.flag & BAM_FPAIRED) != 0 + + @is_paired.setter + def is_paired(self,val): + pysam_update_flag(self._delegate, val, BAM_FPAIRED) + + @property + def is_proper_pair(self): + """True if read is mapped in a proper pair.""" + return (self.flag & BAM_FPROPER_PAIR) != 0 + + @is_proper_pair.setter + def is_proper_pair(self, val): + pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) + + @property + def is_unmapped(self): + """True if read itself is unmapped.""" + return (self.flag & BAM_FUNMAP) != 0 + + @is_unmapped.setter + def is_unmapped(self, val): + pysam_update_flag(self._delegate, val, BAM_FUNMAP) + # setting the unmapped flag requires recalculation of + # bin as alignment length is now implicitely 1 + update_bin(self._delegate) + + @property + def 
mate_is_unmapped(self): + """True if the mate is unmapped.""" + return (self.flag & BAM_FMUNMAP) != 0 + + @mate_is_unmapped.setter + def mate_is_unmapped(self, val): + pysam_update_flag(self._delegate, val, BAM_FMUNMAP) + + @property + def is_reverse(self): + """True if read is mapped to reverse strand.""" + return (self.flag & BAM_FREVERSE) != 0 + + @is_reverse.setter + def is_reverse(self,val): + pysam_update_flag(self._delegate, val, BAM_FREVERSE) + + @property + def mate_is_reverse(self): + """True is read is mapped to reverse strand.""" + return (self.flag & BAM_FMREVERSE) != 0 + + @mate_is_reverse.setter + def mate_is_reverse(self, val): + pysam_update_flag(self._delegate, val, BAM_FMREVERSE) + + @property + def is_read1(self): + """true if this is read1.""" + return (self.flag & BAM_FREAD1) != 0 + + @is_read1.setter + def is_read1(self, val): + pysam_update_flag(self._delegate, val, BAM_FREAD1) + + @property + def is_read2(self): + """true if this is read2.""" + return (self.flag & BAM_FREAD2) != 0 + + @is_read2.setter + def is_read2(self, val): + pysam_update_flag(self._delegate, val, BAM_FREAD2) + + @property + def is_secondary(self): + """true if not primary alignment.""" + return (self.flag & BAM_FSECONDARY) != 0 + + @is_secondary.setter + def is_secondary(self, val): + pysam_update_flag(self._delegate, val, BAM_FSECONDARY) + + @property + def is_qcfail(self): + """true if QC failure.""" + return (self.flag & BAM_FQCFAIL) != 0 + + @is_qcfail.setter + def is_qcfail(self, val): + pysam_update_flag(self._delegate, val, BAM_FQCFAIL) + + @property + def is_duplicate(self): + """true if optical or PCR duplicate.""" + return (self.flag & BAM_FDUP) != 0 + + @is_duplicate.setter + def is_duplicate(self, val): + pysam_update_flag(self._delegate, val, BAM_FDUP) + + @property + def is_supplementary(self): + """true if this is a supplementary alignment.""" + return (self.flag & BAM_FSUPPLEMENTARY) != 0 + + @is_supplementary.setter + def is_supplementary(self, val): + pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY) # 2. Coordinates and lengths - property reference_end: - '''aligned reference position of the read on the reference genome. + @property + def reference_end(self): + """aligned reference position of the read on the reference genome. reference_end points to one past the last aligned residue. Returns None if not available (read is unmapped or no cigar alignment present). - ''' - def __get__(self): - cdef bam1_t * src - src = self._delegate - if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: - return None - return bam_endpos(src) - - property reference_length: - '''aligned length of the read on the reference genome. - - This is equal to `aend - pos`. Returns None if not available.''' - def __get__(self): - cdef bam1_t * src - src = self._delegate - if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: - return None - return bam_endpos(src) - \ - self._delegate.core.pos - - property query_alignment_sequence: + """ + cdef bam1_t * src + src = self._delegate + if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: + return None + return bam_endpos(src) + + @property + def reference_length(self): + """aligned length of the read on the reference genome. + + This is equal to `aend - pos`. Returns None if not available. 
+
+        """
+        cdef bam1_t * src
+        src = self._delegate
+        if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+            return None
+        return bam_endpos(src) - \
+            self._delegate.core.pos
+
+    @property
+    def query_alignment_sequence(self):
        """aligned portion of the read.

        This is a substring of :attr:`seq` that excludes flanking
@@ -1635,28 +1700,29 @@ cdef class AlignedSegment:
        may have been retained.

        """
+        if self.cache_query_alignment_sequence:
+            return self.cache_query_alignment_sequence

-        def __get__(self):
-            if self.cache_query_alignment_sequence:
-                return self.cache_query_alignment_sequence
+        cdef bam1_t * src
+        cdef uint32_t start, end

-            cdef bam1_t * src
-            cdef uint32_t start, end
+        src = self._delegate

-            src = self._delegate
+        if src.core.l_qseq == 0:
+            return None

-            if src.core.l_qseq == 0:
-                return None
+        start = getQueryStart(src)
+        end = getQueryEnd(src)

-            start = getQueryStart(src)
-            end = getQueryEnd(src)
+        self.cache_query_alignment_sequence = force_str(
+            getSequenceInRange(src, start, end))
+        return self.cache_query_alignment_sequence

-            self.cache_query_alignment_sequence = force_str(
-                getSequenceInRange(src, start, end))
-            return self.cache_query_alignment_sequence
+    @property
+    def query_alignment_qualities(self):
+        """aligned query sequence quality values (None if not present).

-    property query_alignment_qualities:
-        """aligned query sequence quality values (None if not present). These
+        These
        are the quality values that correspond to :attr:`query`, that
        is, they exclude qualities of :term:`soft clipped` bases. This
        is equal to ``qual[qstart:qend]``.
@@ -1669,59 +1735,61 @@ cdef class AlignedSegment:
        This property is read-only.

        """
-        def __get__(self):
+        if self.cache_query_alignment_qualities:
+            return self.cache_query_alignment_qualities

-            if self.cache_query_alignment_qualities:
-                return self.cache_query_alignment_qualities
+        cdef bam1_t * src
+        cdef uint32_t start, end

-            cdef bam1_t * src
-            cdef uint32_t start, end
+        src = self._delegate

-            src = self._delegate
+        if src.core.l_qseq == 0:
+            return None

-            if src.core.l_qseq == 0:
-                return None
+        start = getQueryStart(src)
+        end = getQueryEnd(src)

-            start = getQueryStart(src)
-            end = getQueryEnd(src)
-            self.cache_query_alignment_qualities = \
-                getQualitiesInRange(src, start, end)
-            return self.cache_query_alignment_qualities
+        self.cache_query_alignment_qualities = getQualitiesInRange(src, start, end)
+        return self.cache_query_alignment_qualities

-    property query_alignment_start:
+    @property
+    def query_alignment_start(self):
        """start index of the aligned query portion of the sequence (0-based,
        inclusive).

        This is the index of the first base in :attr:`seq` that is not
        soft-clipped.
+
        """
-        def __get__(self):
-            return getQueryStart(self._delegate)
+        return getQueryStart(self._delegate)

-    property query_alignment_end:
+    @property
+    def query_alignment_end(self):
        """end index of the aligned query portion of the sequence (0-based,
        exclusive)

        This is the index just past the last base in :attr:`seq` that is not
        soft-clipped.
+
        """
-        def __get__(self):
-            return getQueryEnd(self._delegate)
+        return getQueryEnd(self._delegate)

-    property query_alignment_length:
+    @property
+    def query_alignment_length(self):
        """length of the aligned query sequence.
-        This is equal to :attr:`qend` - :attr:`qstart`"""
-        def __get__(self):
-            cdef bam1_t * src
-            src = self._delegate
-            return getQueryEnd(src) - getQueryStart(src)
+        This is equal to :attr:`qend` - :attr:`qstart`
+
+        """
+        cdef bam1_t * src
+        src = self._delegate
+        return getQueryEnd(src) - getQueryStart(src)

    #####################################################
    # Computed properties

    def get_reference_positions(self, full_length=False):
-        """a list of reference positions that this read aligns to.
+        """list of reference positions to which this read is aligned.

        By default, this method only returns positions in the
        reference that are within the alignment. If *full_length* is
@@ -1768,6 +1836,7 @@ cdef class AlignedSegment:
        including hard-clipped bases.

        Returns None if CIGAR alignment is not present.
+
        """
        cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
        if l > 0:
@@ -1791,6 +1860,7 @@ cdef class AlignedSegment:
        complemented.

        Returns None if the record has no query sequence.
+
        """
        if self.query_sequence is None:
            return None
@@ -1803,6 +1873,7 @@ cdef class AlignedSegment:
        """return base qualities of the read sequence.

        Reads mapping to the reverse strand will be reversed.
+
        """
        if self.is_reverse:
            return self.query_qualities[::-1]
@@ -1846,7 +1917,6 @@ cdef class AlignedSegment:
        # read sequence, cigar and MD tag are consistent.

        if _with_seq:
-            # force_str required for py2/py3 compatibility
            ref_seq = force_str(build_reference_sequence(src))
            if ref_seq is None:
                raise ValueError("MD tag not present")
@@ -1934,8 +2004,8 @@ cdef class AlignedSegment:
        might be directly adjacent. This happens if
        the two blocks are separated by an insertion in the read.
-        """

+        """
        cdef uint32_t k, pos, l
        cdef int op
        cdef uint32_t * cigar_p
@@ -1966,6 +2036,7 @@ cdef class AlignedSegment:
        *start* and *end* on the reference sequence.

        Return None if cigar alignment is not available.
+
        """
        cdef uint32_t k, i, pos, overlap
        cdef int op, o
@@ -2034,7 +2105,6 @@ cdef class AlignedSegment:
        for each cigar operation.

        """
-
        cdef int nfields = NCIGAR_CODES + 1

        cdef c_array.array base_counts = array.array(
@@ -2070,9 +2140,11 @@ cdef class AlignedSegment:
    #####################################################
    ## Unsorted as yet
    # TODO: capture in CIGAR object
-    property cigartuples:
-        """the :term:`cigar` alignment. The alignment
-        is returned as a list of tuples of (operation, length).
+    @property
+    def cigartuples(self):
+        """the :term:`cigar` alignment.
+
+        The alignment is returned as a list of tuples of (operation, length).

        If the alignment is not present, None is returned.

@@ -2109,73 +2181,70 @@ cdef class AlignedSegment:

        To unset the cigar property, assign an empty list
        or None.
+        """
-        def __get__(self):
-            cdef uint32_t * cigar_p
-            cdef bam1_t * src
-            cdef uint32_t op, l
-            cdef uint32_t k
+        cdef uint32_t * cigar_p
+        cdef bam1_t * src
+        cdef uint32_t op, l
+        cdef uint32_t k

-            src = self._delegate
-            if pysam_get_n_cigar(src) == 0:
-                return None
+        src = self._delegate
+        if pysam_get_n_cigar(src) == 0:
+            return None

-            cigar = []
+        cigar = []

-            cigar_p = pysam_bam_get_cigar(src);
-            for k from 0 <= k < pysam_get_n_cigar(src):
-                op = cigar_p[k] & BAM_CIGAR_MASK
-                l = cigar_p[k] >> BAM_CIGAR_SHIFT
-                cigar.append((op, l))
-            return cigar
+        cigar_p = pysam_bam_get_cigar(src);
+        for k from 0 <= k < pysam_get_n_cigar(src):
+            op = cigar_p[k] & BAM_CIGAR_MASK
+            l = cigar_p[k] >> BAM_CIGAR_SHIFT
+            cigar.append((op, l))
+        return cigar

-        def __set__(self, values):
+    @cigartuples.setter
+    def cigartuples(self, values):
+        cdef uint32_t * p
+        cdef bam1_t * src
+        cdef op, l
+        cdef int k

-            cdef uint32_t * p
-            cdef bam1_t * src
-            cdef op, l
-            cdef int k
+        k = 0

-            k = 0
+        src = self._delegate

-            src = self._delegate
+        # get location of cigar string
+        p = pysam_bam_get_cigar(src)

-            # get location of cigar string
-            p = pysam_bam_get_cigar(src)
+        # empty values for cigar string
+        if values is None:
+            values = []

-            # empty values for cigar string
-            if values is None:
-                values = []
+        cdef uint32_t ncigar = len(values)

-            cdef uint32_t ncigar = len(values)
+        cdef bam1_t * retval = pysam_bam_update(src,
+                                                pysam_get_n_cigar(src) * 4,
+                                                ncigar * 4,
+                                                p)

-            cdef bam1_t * retval = pysam_bam_update(src,
-                                                    pysam_get_n_cigar(src) * 4,
-                                                    ncigar * 4,
-                                                    p)
+        if retval == NULL:
+            raise MemoryError("could not allocate memory")

-            if retval == NULL:
-                raise MemoryError("could not allocate memory")
+        # length is number of cigar operations, not bytes
+        pysam_set_n_cigar(src, ncigar)

-            # length is number of cigar operations, not bytes
-            pysam_set_n_cigar(src, ncigar)
+        # re-acquire pointer to location in memory
+        # as it might have moved
+        p = pysam_bam_get_cigar(src)

-            # re-acquire pointer to location in memory
-            # as it might have moved
-            p = pysam_bam_get_cigar(src)
+        # insert cigar operations
+        for op, l in values:
+            p[k] = l << BAM_CIGAR_SHIFT | op
+            k += 1

-            # insert cigar operations
-            for op, l in values:
-                p[k] = l << BAM_CIGAR_SHIFT | op
-                k += 1
+        ## setting the cigar string requires updating the bin
+        update_bin(src)

-            ## setting the cigar string requires updating the bin
-            update_bin(src)

-    cpdef set_tag(self,
-                  tag,
-                  value,
-                  value_type=None,
-                  replace=True):
+    cpdef set_tag(self, tag, value, value_type=None, replace=True):
        """sets a particular field *tag* to *value* in the
        optional alignment section.
@@ -2216,8 +2285,8 @@ cdef class AlignedSegment:
        Note that a single character string will be output
        as 'Z' and not 'A' as the former is the more general
        type.
-        """
+
+        """
        cdef int value_size
        cdef uint8_t tc
        cdef uint8_t * value_ptr
@@ -2254,54 +2323,54 @@ cdef class AlignedSegment:
                    value, value_type))

        # sam_format1 for typecasting
-        if typecode == 'Z':
+        if typecode == b'Z':
            value = force_bytes(value)
            value_ptr = value
            value_size = len(value)+1
-        elif typecode == 'H':
+        elif typecode == b'H':
            # Note that hex tags are stored the very same
            # way as Z strings.
            value = force_bytes(value)
            value_ptr = value
            value_size = len(value)+1
-        elif typecode == 'A' or typecode == 'a':
+        elif typecode == b'A' or typecode == b'a':
            value = force_bytes(value)
            value_ptr = value
            value_size = sizeof(char)
-            typecode = 'A'
+            typecode = b'A'
-        elif typecode == 'i':
+        elif typecode == b'i':
            int32_t_value = value
            value_ptr = &int32_t_value
            value_size = sizeof(int32_t)
-        elif typecode == 'I':
+        elif typecode == b'I':
            uint32_t_value = value
            value_ptr = &uint32_t_value
            value_size = sizeof(uint32_t)
-        elif typecode == 's':
+        elif typecode == b's':
            int16_t_value = value
            value_ptr = &int16_t_value
            value_size = sizeof(int16_t)
-        elif typecode == 'S':
+        elif typecode == b'S':
            uint16_t_value = value
            value_ptr = &uint16_t_value
            value_size = sizeof(uint16_t)
-        elif typecode == 'c':
+        elif typecode == b'c':
            int8_t_value = value
            value_ptr = &int8_t_value
            value_size = sizeof(int8_t)
-        elif typecode == 'C':
+        elif typecode == b'C':
            uint8_t_value = value
            value_ptr = &uint8_t_value
            value_size = sizeof(uint8_t)
-        elif typecode == 'd':
+        elif typecode == b'd':
            double_value = value
            value_ptr = &double_value
            value_size = sizeof(double)
-        elif typecode == 'f':
+        elif typecode == b'f':
            float_value = value
            value_ptr = &float_value
            value_size = sizeof(float)
-        elif typecode == 'B':
+        elif typecode == b'B':
            # the following goes through python, needs to be cleaned up
            # pack array using struct
            fmt, args = pack_tags([(tag, value, value_type)])
@@ -2332,8 +2401,7 @@ cdef class AlignedSegment:
            value_ptr)

    cpdef has_tag(self, tag):
-        """returns true if the optional alignment section
-        contains a given *tag*."""
+        """returns true if the optional alignment section contains a given *tag*."""
        cdef uint8_t * v
        cdef int nvalues
        btag = force_bytes(tag)
@@ -2341,8 +2409,7 @@ cdef class AlignedSegment:
        return v != NULL

    cpdef get_tag(self, tag, with_value_type=False):
-        """
-        retrieves data from the optional alignment section
+        """retrieve data from the optional alignment section
        given a two-letter *tag* denoting the field.

        The returned value is cast into an appropriate python type.
@@ -2380,12 +2447,13 @@ cdef class AlignedSegment:
        v = bam_aux_get(self._delegate, btag)
        if v == NULL:
            raise KeyError("tag '%s' not present" % tag)
+
        if chr(v[0]) == "B":
            auxtype = chr(v[0]) + chr(v[1])
        else:
            auxtype = chr(v[0])

        if auxtype in "iIcCsS":
            value = bam_aux2i(v)
        elif auxtype == 'f' or auxtype == 'F':
            value = bam_aux2f(v)
        elif auxtype == 'd' or auxtype == 'D':
            value = bam_aux2f(v)
        elif auxtype == 'A' or auxtype == 'a':
            # force A to a
-            v[0] = 'A'
+            v[0] = b'A'
            # there might be a more efficient way
            # to convert a char into a string
            value = '%c' % bam_aux2A(v)
        elif auxtype == 'Z' or auxtype == 'H':
            # Z and H are treated equally as strings in htslib
            value = charptr_to_str(bam_aux2Z(v))
        elif auxtype[0] == 'B':
            bytesize, nvalues, values = convert_binary_tag(v + 1)
            value = values
        else:
@@ -2430,7 +2498,6 @@ cdef class AlignedSegment:
        :meth:`get_tag` for a quicker way to achieve this.
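Continuing the earlier sketch, the tag accessors described here compose as follows (the tag name and values are illustrative)::

    a.set_tag("NM", 2, value_type="i")
    assert a.has_tag("NM")
    value, vtype = a.get_tag("NM", with_value_type=True)
    assert (value, vtype) == (2, "i")
    # get_tags walks the whole optional section in one pass
    print(a.get_tags(with_value_type=True))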
""" - cdef char * ctag cdef bam1_t * src cdef uint8_t * s @@ -2451,29 +2518,29 @@ cdef class AlignedSegment: auxtag[1] = s[1] s += 2 auxtype = s[0] - if auxtype in ('c', 'C'): + if auxtype in (b'c', b'C'): value = bam_aux2i(s) s += 1 - elif auxtype in ('s', 'S'): + elif auxtype in (b's', b'S'): value = bam_aux2i(s) s += 2 - elif auxtype in ('i', 'I'): + elif auxtype in (b'i', b'I'): value = bam_aux2i(s) s += 4 - elif auxtype == 'f': + elif auxtype == b'f': value = bam_aux2f(s) s += 4 - elif auxtype == 'd': + elif auxtype == b'd': value = bam_aux2f(s) s += 8 - elif auxtype in ('A', 'a'): + elif auxtype in (b'A', b'a'): value = "%c" % bam_aux2A(s) s += 1 - elif auxtype in ('Z', 'H'): + elif auxtype in (b'Z', b'H'): value = charptr_to_str(bam_aux2Z(s)) # +1 for NULL terminated string s += len(value) + 1 - elif auxtype == 'B': + elif auxtype == b'B': s += 1 byte_size, nvalues, value = convert_binary_tag(s) # 5 for 1 char and 1 int @@ -2502,8 +2569,8 @@ cdef class AlignedSegment: This method will not enforce the rule that the same tag may appear only once in the optional alignment section. - """ + """ cdef bam1_t * src cdef uint8_t * s cdef char * temp @@ -2548,14 +2615,15 @@ cdef class AlignedSegment: cdef class PileupColumn: - '''A pileup of reads at a particular reference sequence position + """A pileup of reads at a particular reference sequence position (:term:`column`). A pileup column contains all the reads that map to a certain target base. This class is a proxy for results returned by the samtools pileup engine. If the underlying engine iterator advances, the results of this column will change. - ''' + + """ def __init__(self): raise TypeError("this class cannot be instantiated from Python") @@ -2571,346 +2639,79 @@ cdef class PileupColumn: free(self.buf.s) def set_min_base_quality(self, min_base_quality): - """set the minimum base quality for this pileup column. - """ + """set the minimum base quality for this pileup column.""" self.min_base_quality = min_base_quality def __len__(self): """return number of reads aligned to this column. see :meth:`get_num_aligned` + """ return self.get_num_aligned() - property reference_id: - '''the reference sequence number as defined in the header''' - def __get__(self): - return self.tid + @property + def reference_id(self): + """reference sequence number as defined in the header.""" + return self.tid - property reference_name: - """:term:`reference` name (None if no AlignmentFile is associated)""" - def __get__(self): - if self.header is not None: - return self.header.get_reference_name(self.tid) + @property + def reference_name(self): + """:term:`reference` name (None if no AlignmentFile is associated).""" + if self.header is None: return None - property nsegments: - '''number of reads mapping to this column. - - Note that this number ignores the base quality filter.''' - def __get__(self): - return self.n_pu - def __set__(self, n): - self.n_pu = n - - property reference_pos: - '''the position in the reference sequence (0-based).''' - def __get__(self): - return self.pos - - property pileups: - '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' - def __get__(self): - if self.plp == NULL or self.plp[0] == NULL: - raise ValueError("PileupColumn accessed after iterator finished") - - cdef int x - cdef bam_pileup1_t * p = NULL - pileups = [] - - # warning: there could be problems if self.n and self.buf are - # out of sync. 
- for x from 0 <= x < self.n_pu: - p = &(self.plp[0][x]) - if p == NULL: - raise ValueError( - "pileup buffer out of sync - most likely use of iterator " - "outside loop") - if pileup_base_qual_skip(p, self.min_base_quality): - continue - pileups.append(makePileupRead(p, self.header)) - return pileups - - ######################################################## - # Compatibility Accessors - # Functions, properties for compatibility with pysam < 0.8 - ######################################################## - def get_num_aligned(self): - """return number of aligned bases at pileup column position. - - This method applies a base quality filter and the number is - equal to the size of :meth:`get_query_sequences`, - :meth:`get_mapping_qualities`, etc. - - """ - cdef uint32_t x = 0 - cdef uint32_t c = 0 - cdef uint32_t cnt = 0 - cdef bam_pileup1_t * p = NULL - if self.plp == NULL or self.plp[0] == NULL: - raise ValueError("PileupColumn accessed after iterator finished") - - for x from 0 <= x < self.n_pu: - p = &(self.plp[0][x]) - if p == NULL: - raise ValueError( - "pileup buffer out of sync - most likely use of iterator " - "outside loop") - if pileup_base_qual_skip(p, self.min_base_quality): - continue - cnt += 1 - return cnt - - def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False): - """query bases/sequences at pileup column position. - - Optionally, the bases/sequences can be annotated according to the samtools - mpileup format. This is the format description from the samtools mpileup tool:: - - Information on match, mismatch, indel, strand, mapping - quality and start and end of a read are all encoded at the - read base column. At this column, a dot stands for a match - to the reference base on the forward strand, a comma for a - match on the reverse strand, a '>' or '<' for a reference - skip, `ACGTN' for a mismatch on the forward strand and - `acgtn' for a mismatch on the reverse strand. A pattern - `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion - between this reference position and the next reference - position. The length of the insertion is given by the - integer in the pattern, followed by the inserted - sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' - represents a deletion from the reference. The deleted bases - will be presented as `*' in the following lines. Also at - the read base column, a symbol `^' marks the start of a - read. The ASCII of the character following `^' minus 33 - gives the mapping quality. A symbol `$' marks the end of a - read segment - - To reproduce samtools mpileup format, set all of mark_matches, - mark_ends and add_indels to True. - - Parameters - ---------- - - mark_matches: bool - - If True, output bases matching the reference as "," or "." - for forward and reverse strand, respectively. This mark - requires the reference sequence. If no reference is - present, this option is ignored. - - mark_ends : bool - - If True, add markers "^" and "$" for read start and end, respectively. - - add_indels : bool - - If True, add bases for bases inserted into the reference and - 'N's for base skipped from the reference. If a reference sequence - is given, add the actual bases. - - Returns - ------- - - list: a list of bases/sequences per read at pileup column position. 
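The mpileup-style markup described in the docstring above is easiest to exercise through the public pileup interface; a sketch under assumed inputs (an indexed ``ex1.bam`` with a contig named ``chr1``)::

    import pysam

    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        for col in bam.pileup("chr1", 100, 120):
            col.set_min_base_quality(13)   # filter applied by the accessors
            names = [p.alignment.query_name
                     for p in col.pileups if not p.is_del]
            print(col.reference_name, col.reference_pos, len(col), names)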
-
-        """
-        cdef uint32_t x = 0
-        cdef uint32_t j = 0
-        cdef uint32_t c = 0
-        cdef uint8_t cc = 0
-        cdef uint8_t rb = 0
-        cdef kstring_t * buf = &self.buf
-        cdef bam_pileup1_t * p = NULL
-
-        if self.plp == NULL or self.plp[0] == NULL:
-            raise ValueError("PileupColumn accessed after iterator finished")
-
-        buf.l = 0
-
-        # todo: reference sequence to count matches/mismatches
-        # todo: convert assertions to exceptions
-        for x from 0 <= x < self.n_pu:
-            p = &(self.plp[0][x])
-            if p == NULL:
-                raise ValueError(
-                    "pileup buffer out of sync - most likely use of iterator "
-                    "outside loop")
-            if pileup_base_qual_skip(p, self.min_base_quality):
-                continue
-            # see samtools pileup_seq
-            if mark_ends and p.is_head:
-                kputc('^', buf)
-
-                if p.b.core.qual > 93:
-                    kputc(126, buf)
-                else:
-                    kputc(p.b.core.qual + 33, buf)
-            if not p.is_del:
-                if p.qpos < p.b.core.l_qseq:
-                    cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
-                else:
-                    cc = 'N'
-
-                if mark_matches and self.reference_sequence != NULL:
-                    rb = self.reference_sequence[self.reference_pos]
-                    if seq_nt16_table[cc] == seq_nt16_table[rb]:
-                        cc = "="
-                kputc(strand_mark_char(cc, p.b), buf)
-            elif add_indels:
-                if p.is_refskip:
-                    if bam_is_rev(p.b):
-                        kputc('<', buf)
-                    else:
-                        kputc('>', buf)
-                else:
-                    kputc('*', buf)
-            if add_indels:
-                if p.indel > 0:
-                    kputc('+', buf)
-                    kputw(p.indel, buf)
-                    for j from 1 <= j <= p.indel:
-                        cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
-                        kputc(strand_mark_char(cc, p.b), buf)
-                elif p.indel < 0:
-                    kputc('-', buf)
-                    kputw(-p.indel, buf)
-                    for j from 1 <= j <= -p.indel:
-                        # TODO: out-of-range check here?
-                        if self.reference_sequence == NULL:
-                            cc = 'N'
-                        else:
-                            cc = self.reference_sequence[self.reference_pos + j]
-                        kputc(strand_mark_char(cc, p.b), buf)
-            if mark_ends and p.is_tail:
-                kputc('$', buf)
-
-            kputc(':', buf)
-
-        if buf.l == 0:
-            # could be zero if all qualities are too low
-            return ""
-        else:
-            # quicker to ensemble all and split than to encode all separately.
-            # ignore last ":"
-            return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
-
-    def get_query_qualities(self):
-        """query base quality scores at pileup column position.
-
-        Returns
-        -------
-
-        list: a list of quality scores
-        """
-        cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
-        cdef uint32_t c = 0
-        result = []
-        for x from 0 <= x < self.n_pu:
-            p = &(self.plp[0][x])
-            if p == NULL:
-                raise ValueError(
-                    "pileup buffer out of sync - most likely use of iterator "
-                    "outside loop")
+        return self.header.get_reference_name(self.tid)
 
-            if p.qpos < p.b.core.l_qseq:
-                c = bam_get_qual(p.b)[p.qpos]
-            else:
-                c = 0
-            if c < self.min_base_quality:
-                continue
-            result.append(c)
-        return result
-
-    def get_mapping_qualities(self):
-        """query mapping quality scores at pileup column position.
+    @property
+    def nsegments(self):
+        """number of reads mapping to this column.
 
-        Returns
-        -------
+        Note that this number ignores the base quality filter.
 
-        list: a list of quality scores
         """
-        if self.plp == NULL or self.plp[0] == NULL:
-            raise ValueError("PileupColumn accessed after iterator finished")
-
-        cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
-        result = []
-        for x from 0 <= x < self.n_pu:
-            p = &(self.plp[0][x])
-            if p == NULL:
-                raise ValueError(
-                    "pileup buffer out of sync - most likely use of iterator "
-                    "outside loop")
+        return self.n_pu
 
-            if pileup_base_qual_skip(p, self.min_base_quality):
-                continue
-            result.append(p.b.core.qual)
-        return result
+    @nsegments.setter
+    def nsegments(self, n):
+        self.n_pu = n
 
-    def get_query_positions(self):
-        """positions in read at pileup column position.
+    @property
+    def reference_pos(self):
+        """the position in the reference sequence (0-based)."""
+        return self.pos
 
-        Returns
-        -------
-
-        list: a list of read positions
-        """
+    @property
+    def pileups(self):
+        """list of reads (:class:`pysam.PileupRead`) aligned to this column."""
         if self.plp == NULL or self.plp[0] == NULL:
             raise ValueError("PileupColumn accessed after iterator finished")
 
-        cdef uint32_t x = 0
+        cdef int x
         cdef bam_pileup1_t * p = NULL
-        result = []
+        pileups = []
 
-    def get_query_names(self):
-        """query/read names aligned at pileup column position.
-
-        Returns
-        -------
-
-        list: a list of query names at pileup column position.
-        """
-        if self.plp == NULL or self.plp[0] == NULL:
-            raise ValueError("PileupColumn accessed after iterator finished")
-
-        cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
-        result = []
+        # warning: there could be problems if self.n and self.buf are
+        # out of sync.
         for x from 0 <= x < self.n_pu:
             p = &(self.plp[0][x])
             if p == NULL:
                 raise ValueError(
                     "pileup buffer out of sync - most likely use of iterator "
                     "outside loop")
-
             if pileup_base_qual_skip(p, self.min_base_quality):
                 continue
-            result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
-        return result
+            pileups.append(makePileupRead(p, self.header))
+        return pileups
 
 
 cdef class PileupRead:
-    '''Representation of a read aligned to a particular position in the
+    """Representation of a read aligned to a particular position in the
     reference sequence.
-    '''
-
+    """
     def __init__(self):
-        raise TypeError(
-            "this class cannot be instantiated from Python")
+        raise TypeError("this class cannot be instantiated from Python")
 
     def __str__(self):
         return "\t".join(
@@ -2920,33 +2721,34 @@ cdef class PileupRead:
                       self.is_del, self.is_head,
                       self.is_tail, self.is_refskip)))
 
-    property alignment:
-        """a :class:`pysam.AlignedSegment` object of the aligned read"""
-        def __get__(self):
-            return self._alignment
+    @property
+    def alignment(self):
+        """a :class:`pysam.AlignedSegment` object of the aligned read."""
+        return self._alignment
 
-    property query_position:
+    @property
+    def query_position(self):
         """position of the read base at the pileup site, 0-based.
 
         None if is_del or is_refskip is set.
 
         """
-        def __get__(self):
-            if self.is_del or self.is_refskip:
-                return None
-            else:
-                return self._qpos
+        if self.is_del or self.is_refskip:
+            return None
+        else:
+            return self._qpos
 
-    property query_position_or_next:
+    @property
+    def query_position_or_next(self):
         """position of the read base at the pileup site, 0-based.
If the current position is a deletion, returns the next aligned base. """ - def __get__(self): - return self._qpos + return self._qpos - property indel: + @property + def indel(self): """indel length for the position following the current pileup site. This quantity peeks ahead to the next cigar operation in this @@ -2955,34 +2757,36 @@ cdef class PileupRead: negation. 0 if the next operation is not an indel. """ - def __get__(self): - return self._indel + return self._indel + + @property + def level(self): + """the level of the read in the "viewer" mode. - property level: - """the level of the read in the "viewer" mode. Note that this value - is currently not computed.""" - def __get__(self): - return self._level + Note that this value is currently not computed. + + """ + return self._level - property is_del: - """1 iff the base on the padded read is a deletion""" - def __get__(self): - return self._is_del + @property + def is_del(self): + """True iff the base on the padded read is a deletion.""" + return bool(self._is_del) - property is_head: - """1 iff the base on the padded read is the left-most base.""" - def __get__(self): - return self._is_head + @property + def is_head(self): + """True iff the base on the padded read is the left-most base.""" + return bool(self._is_head) - property is_tail: - """1 iff the base on the padded read is the right-most base.""" - def __get__(self): - return self._is_tail + @property + def is_tail(self): + """True iff the base on the padded read is the right-most base.""" + return bool(self._is_tail) - property is_refskip: - """1 iff the base on the padded read is part of CIGAR N op.""" - def __get__(self): - return self._is_refskip + @property + def is_refskip(self): + """True iff the base on the padded read is part of CIGAR N op.""" + return bool(self._is_refskip) @@ -3052,4 +2856,5 @@ __all__ = [ "FQCFAIL", "FDUP", "FSUPPLEMENTARY", - "KEY_NAMES"] + "KEY_NAMES", +] diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index e8e2081..4515e36 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free @@ -143,8 +145,6 @@ cdef class IteratorColumn: cdef reset(self, tid, start, stop) cdef _free_pileup_iter(self) - # backwards compatibility - cdef char * getSequence(self) cdef class IteratorColumnRegion(IteratorColumn): diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 753381d..31766e3 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1,6 +1,4 @@ -# cython: embedsignature=True -# cython: profile=True -######################################################## +# cython: language_level=3, embedsignature=True, profile=True ######################################################## # Cython wrapper for SAM/BAM/CRAM files based on htslib ######################################################## @@ -98,43 +96,59 @@ IndexStats = collections.namedtuple("IndexStats", cdef int MAX_POS = (1 << 31) - 1 # valid types for SAM headers -VALID_HEADER_TYPES = {"HD" : Mapping, - "SQ" : Sequence, - "RG" : Sequence, - "PG" : Sequence, - "CO" : Sequence} +VALID_HEADER_TYPES = { + "HD": Mapping, + "SQ": Sequence, + "RG": Sequence, + "PG": Sequence, + "CO": Sequence, +} # order of records within SAM headers VALID_HEADERS = ("HD", "SQ", "RG", 
"PG", "CO") # default type conversions within SAM header records -KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str}, - "SQ" : {"SN" : str, "LN" : int, "AS" : str, - "M5" : str, "SP" : str, "UR" : str, - "AH" : str,}, - "RG" : {"ID" : str, "CN" : str, "DS" : str, - "DT" : str, "FO" : str, "KS" : str, - "LB" : str, "PG" : str, "PI" : str, - "PL" : str, "PM" : str, "PU" : str, - "SM" : str,}, - "PG" : {"ID" : str, "PN" : str, "CL" : str, - "PP" : str, "DS" : str, "VN" : str,},} +KNOWN_HEADER_FIELDS = { + "HD": {"VN": str, "SO": str, "GO": str}, + "SQ": { + "SN": str, + "LN": int, + "AS": str, + "M5": str, + "SP": str, + "UR": str, + "AH": str, + }, + "RG": { + "ID": str, + "CN": str, + "DS": str, + "DT": str, + "FO": str, + "KS": str, + "LB": str, + "PG": str, + "PI": str, + "PL": str, + "PM": str, + "PU": str, + "SM": str, + }, + "PG": {"ID": str, "PN": str, "CL": str, "PP": str, "DS": str, "VN": str}, +} # output order of fields within records. Ensure that CL is at # the end as parsing a CL will ignore any subsequent records. -VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"), - "SQ" : ("SN", "LN", "AS", "M5", - "UR", "SP", "AH"), - "RG" : ("ID", "CN", "SM", "LB", - "PU", "PI", "DT", "DS", - "PL", "FO", "KS", "PG", - "PM"), - "PG" : ("PN", "ID", "VN", "PP", - "DS", "CL"),} +VALID_HEADER_ORDER = { + "HD" : ("VN", "SO", "GO"), + "SQ" : ("SN", "LN", "AS", "M5", "UR", "SP", "AH"), + "RG" : ("ID", "CN", "SM", "LB", "PU", "PI", "DT", "DS", "PL", "FO", "KS", "PG", "PM"), + "PG" : ("PN", "ID", "VN", "PP", "DS", "CL"), +} def build_header_line(fields, record): - '''build a header line from `fields` dictionary for `record`''' + """build a header line from `fields` dictionary for `record`""" # TODO: add checking for field and sort order line = ["@%s" % record] @@ -177,8 +191,7 @@ cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest, reference_lengths, add_sq_text=True, text=None) except -1: - """build header from list of reference names and lengths. - """ + """build header from list of reference names and lengths.""" cdef class AlignmentHeader(object): """header information for a :class:`AlignmentFile` object @@ -211,7 +224,6 @@ cdef class AlignmentHeader(object): construction :term:`SAM` formatted files without a header. """ - # See makeVariantHeader for C constructor def __cinit__(self): self.ptr = NULL @@ -344,34 +356,42 @@ cdef class AlignmentHeader(object): def copy(self): return makeAlignmentHeader(bam_hdr_dup(self.ptr)) - property nreferences: + @property + def nreferences(self): """int with the number of :term:`reference` sequences in the file. - This is a read-only attribute.""" - def __get__(self): - return self.ptr.n_targets - - property references: - """tuple with the names of :term:`reference` sequences. This is a - read-only attribute""" - def __get__(self): - t = [] - cdef int x - for x in range(self.ptr.n_targets): - t.append(charptr_to_str(self.ptr.target_name[x])) - return tuple(t) - - property lengths: - """tuple of the lengths of the :term:`reference` sequences. This is a - read-only attribute. The lengths are in the same order as - :attr:`pysam.AlignmentFile.references` + This is a read-only property. + + """ + return self.ptr.n_targets + + @property + def references(self): + """tuple with the names of :term:`reference` sequences. + + This is a read-only property. 
+ + """ + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(charptr_to_str(self.ptr.target_name[x])) + return tuple(t) + + @property + def lengths(self): + """tuple of the lengths of the :term:`reference` sequences. + + This is a read-only property. + + The lengths are in the same order as :attr:`pysam.AlignmentFile.references` + """ - def __get__(self): - t = [] - cdef int x - for x in range(self.ptr.n_targets): - t.append(self.ptr.target_len[x]) - return tuple(t) + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(self.ptr.target_len[x]) + return tuple(t) def _build_sequence_section(self): """return sequence section of header. @@ -379,8 +399,8 @@ cdef class AlignmentHeader(object): The sequence section is built from the list of reference names and lengths stored in the BAM-file and not from any @SQ entries that are part of the header's text section. - """ + """ cdef int x text = [] for x in range(self.ptr.n_targets): @@ -411,6 +431,7 @@ cdef class AlignmentHeader(object): If no @SQ entries are within the text section of the header, this will be automatically added from the reference names and lengths stored in the binary part of the header. + """ result = collections.OrderedDict() @@ -461,7 +482,8 @@ cdef class AlignmentHeader(object): result[record] = x elif VALID_HEADER_TYPES[record] == Sequence: - if record not in result: result[record] = [] + if record not in result: + result[record] = [] result[record].append(x) # if there are no SQ lines in the header, add the @@ -497,26 +519,24 @@ cdef class AlignmentHeader(object): return self.ptr.target_len[tid] def is_valid_tid(self, int tid): - """ - return True if the numerical :term:`tid` is valid; False otherwise. + """True if the numerical :term:`tid` is valid; False otherwise. Note that the unmapped tid code (-1) counts as an invalid. + """ return 0 <= tid < self.ptr.n_targets def get_tid(self, reference): - """ - return the numerical :term:`tid` corresponding to - :term:`reference` + """return the numerical :term:`tid` corresponding to :term:`reference` returns -1 if reference is not known. + """ reference = force_bytes(reference) return bam_name2id(self.ptr, reference) def __str__(self): - '''string with the full contents of the :term:`sam file` header as a - string. + """string with the full contents of the :term:`sam file` header as a string. If no @SQ entries are within the text section of the header, this will be automatically added from the reference names and @@ -524,51 +544,16 @@ cdef class AlignmentHeader(object): See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed representation of the header. - ''' + + """ text = from_string_and_size(self.ptr.text, self.ptr.l_text) if "@SQ" not in text: text += "\n" + self._build_sequence_section() return text - # dictionary access methods, for backwards compatibility. 
- def __setitem__(self, key, value): - raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()") - - def __getitem__(self, key): - return self.to_dict().__getitem__(key) - - def items(self): - return self.to_dict().items() - - # PY2 compatibility - def iteritems(self): - return self.to_dict().items() - - def keys(self): - return self.to_dict().keys() - - def values(self): - return self.to_dict().values() - - def get(self, *args): - return self.to_dict().get(*args) - - def __len__(self): - return self.to_dict().__len__() - - def __contains__(self, key): - return self.to_dict().__contains__(key) - cdef class AlignmentFile(HTSFile): - """AlignmentFile(filepath_or_object, mode=None, template=None, - reference_names=None, reference_lengths=None, text=NULL, - header=None, add_sq_text=False, check_header=True, check_sq=True, - reference_filename=None, filename=None, index_filename=None, - filepath_index=None, require_index=False, duplicate_filehandle=True, - ignore_truncation=False, threads=1) - - A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. + """A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. If `filepath_or_object` is a string, the file is automatically opened. If `filepath_or_object` is a python File object, the @@ -714,8 +699,8 @@ cdef class AlignmentFile(HTSFile): Number of threads to use for compressing/decompressing BAM/CRAM files. Setting threads to > 1 cannot be combined with `ignore_truncation`. (Default=1) - """ + """ def __cinit__(self, *args, **kwargs): self.htsfile = NULL self.filename = None @@ -736,64 +721,41 @@ cdef class AlignmentFile(HTSFile): if self.b == NULL: raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) + @property def has_index(self): - """return true if htsfile has an existing (and opened) index. - """ + """True if htsfile has an existing (and opened) index.""" return self._index != NULL - def check_index(self): - """return True if index is present. - - Raises - ------ - - AttributeError - if htsfile is :term:`SAM` formatted and thus has no index. - - ValueError - if htsfile is closed or index could not be opened. - """ - - if not self.is_open: - raise ValueError("I/O operation on closed file") - if not self.is_bam and not self.is_cram: - raise AttributeError( - "AlignmentFile.mapped only available in bam files") - if self._index == NULL: - raise ValueError( - "mapping information not recorded in index " - "or index not available") - return True - - def _open(self, - filepath_or_object, - mode=None, - AlignmentFile template=None, - reference_names=None, - reference_lengths=None, - reference_filename=None, - text=None, - header=None, - port=None, - add_sq_text=True, - add_sam_header=True, - check_header=True, - check_sq=True, - index_filename=None, - filepath_index=None, - require_index=False, - referencenames=None, - referencelengths=None, - duplicate_filehandle=True, - ignore_truncation=False, - format_options=None, - threads=1): - '''open a sam, bam or cram formatted file. 
+    def _open(
+        self,
+        filepath_or_object,
+        mode=None,
+        AlignmentFile template=None,
+        reference_names=None,
+        reference_lengths=None,
+        reference_filename=None,
+        text=None,
+        header=None,
+        port=None,
+        add_sq_text=True,
+        add_sam_header=True,
+        check_header=True,
+        check_sq=True,
+        index_filename=None,
+        filepath_index=None,
+        require_index=False,
+        referencenames=None,
+        referencelengths=None,
+        duplicate_filehandle=True,
+        ignore_truncation=False,
+        format_options=None,
+        threads=1):
+        """open a sam, bam or cram formatted file.
 
         If _open is called on an existing file, the current file
         will be closed and a new file will be opened.
-        '''
+
+        """
         cdef char *cfilename = NULL
         cdef char *creference_filename = NULL
         cdef char *cindexname = NULL
@@ -1079,9 +1041,8 @@ cdef class AlignmentFile(HTSFile):
 
         if self.is_bam or self.is_cram:
             if not until_eof and not self.is_remote:
-                if not self.has_index():
-                    raise ValueError(
-                        "fetch called on bamfile without index")
+                if not self.has_index:
+                    raise ValueError("fetch called on bamfile without index")
 
             if has_coord:
                 return IteratorRowRegion(
@@ -1130,7 +1091,7 @@
             raise OSError("index building failed with error {}".format(retval))
 
     def head(self, n, multiple_iterators=True):
-        '''return an iterator over the first n alignments.
+        """return an iterator over the first n alignments.
 
         This iterator is useful for inspecting the bam-file.
 
@@ -1147,12 +1108,11 @@
 
         an iterator over a collection of reads
 
-        '''
-        return IteratorRowHead(self, n,
-                               multiple_iterators=multiple_iterators)
+        """
+        return IteratorRowHead(self, n, multiple_iterators=multiple_iterators)
 
     def mate(self, AlignedSegment read):
-        '''return the mate of :class:`~pysam.AlignedSegment` `read`.
+        """return the mate of :class:`~pysam.AlignedSegment` `read`.
 
         .. note::
 
@@ -1177,7 +1137,7 @@
             ValueError
                 if the read is unpaired or the mate is unmapped
 
-        '''
+        """
         cdef uint32_t flag = read._delegate.core.flag
 
         if flag & BAM_FPAIRED == 0:
@@ -1218,12 +1178,13 @@
                reference=None,
                end=None,
                **kwargs):
-        """perform a :term:`pileup` within a :term:`region`. The region is
-        specified by :term:`contig`, `start` and `stop` (using
-        0-based indexing). :term:`reference` and `end` are also accepted for
-        backward compatiblity as synonyms for :term:`contig` and `stop`,
-        respectively. Alternatively, a samtools 'region' string
-        can be supplied.
+        """Generate a :term:`pileup` within a :term:`region`.
+
+        The region is specified by :term:`contig`, `start` and `stop` (using
+        0-based indexing). :term:`reference` and `end` are also accepted
+        for backward compatibility as synonyms for :term:`contig` and `stop`,
+        respectively. Alternatively, a samtools 'region' string can be
+        supplied.
 
         Without 'contig' or 'region' all reads will be used for the
         pileup. The reads will be returned ordered by
@@ -1341,7 +1302,7 @@
             contig, start, stop, region, reference=reference, end=end)
 
         if self.is_bam or self.is_cram:
-            if not self.has_index():
+            if not self.has_index:
                 raise ValueError("no index available for pileup")
 
             if has_coord:
@@ -1366,7 +1327,7 @@
               read_callback="nofilter",
              reference=None,
              end=None):
-        '''count the number of reads in :term:`region`
+        """count the number of reads in :term:`region`.
 
         The region is specified by :term:`contig`, `start` and `stop`.
:term:`reference` and `end` are also accepted for backward @@ -1425,7 +1386,7 @@ cdef class AlignmentFile(HTSFile): ValueError if the genomic coordinates are out of range or invalid. - ''' + """ cdef AlignedSegment read cdef long counter = 0 @@ -1492,7 +1453,7 @@ cdef class AlignmentFile(HTSFile): end of the genomic region (0-based exclusive). If not given, count to the end of the chromosome. - region : int + region : string a region string. quality_threshold : int @@ -1534,7 +1495,6 @@ cdef class AlignmentFile(HTSFile): four array.arrays of the same length in order A C G T : tuple """ - cdef uint32_t contig_length = self.get_reference_length(contig) cdef int _start = start if start is not None else 0 cdef int _stop = stop if stop is not None else contig_length @@ -1611,13 +1571,15 @@ cdef class AlignmentFile(HTSFile): return count_a, count_c, count_g, count_t def find_introns_slow(self, read_iterator): - """Return a dictionary {(start, stop): count} + """Return a dictionary {(start, stop): count}. + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), and their support ( = number of reads ). read_iterator can be the result of a .fetch(...) call. Or it can be a generator filtering such reads. Example samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ res = collections.Counter() for r in read_iterator: @@ -1635,13 +1597,15 @@ cdef class AlignmentFile(HTSFile): return res def find_introns(self, read_iterator): - """Return a dictionary {(start, stop): count} + """Return a dictionary {(start, stop): count}. + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), and their support ( = number of reads ). read_iterator can be the result of a .fetch(...) call. Or it can be a generator filtering such reads. Example samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ cdef: uint32_t base_position, junc_start, nt @@ -1666,8 +1630,7 @@ cdef class AlignmentFile(HTSFile): def close(self): - '''closes the :class:`pysam.AlignmentFile`.''' - + """closes the :class:`pysam.AlignmentFile`.""" if self.htsfile == NULL: return @@ -1712,8 +1675,7 @@ cdef class AlignmentFile(HTSFile): raise IOError(errno, force_str(strerror(errno))) cpdef int write(self, AlignedSegment read) except -1: - ''' - write a single :class:`pysam.AlignedSegment` to disk. + """write a single :class:`pysam.AlignedSegment`. Raises: ValueError @@ -1723,7 +1685,8 @@ cdef class AlignmentFile(HTSFile): int : the number of bytes written. If the file is closed, this will be 0. - ''' + + """ if not self.is_open: return 0 @@ -1761,61 +1724,76 @@ cdef class AlignmentFile(HTSFile): ############################################################### ## properties ############################################################### - property mapped: - """int with total number of mapped alignments according to the - statistics recorded in the index. This is a read-only - attribute. - """ - def __get__(self): - self.check_index() - cdef int tid - cdef uint64_t total = 0 - cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.nreferences: - with nogil: - hts_idx_get_stat(self._index, tid, &mapped, &unmapped) - total += mapped - return total - - property unmapped: - """int with total number of unmapped reads according to the statistics - recorded in the index. This number of reads includes the number of reads - without coordinates. This is a read-only attribute. 
+ @property + def mapped(self): + """total number of mapped alignments according to the statistics recorded in the index. + + This is a read-only property. + """ - def __get__(self): - self.check_index() - cdef int tid - cdef uint64_t total = hts_idx_get_n_no_coor(self._index) - cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.nreferences: - with nogil: - hts_idx_get_stat(self._index, tid, &mapped, &unmapped) - total += unmapped - return total + if not self.has_index: + raise ValueError('Index is required') + + cdef int tid + cdef uint64_t total = 0 + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: + with nogil: + hts_idx_get_stat(self._index, tid, &mapped, &unmapped) + total += mapped + return total + + @property + def unmapped(self): + """total number of unmapped reads according to the statistics recorded in the index. + + This number of reads includes the number of reads + without coordinates. + + This is a read-only property. - property nocoordinate: - """int with total number of reads without coordinates according to the - statistics recorded in the index. This is a read-only attribute. """ - def __get__(self): - self.check_index() - cdef uint64_t n + if not self.has_index: + raise ValueError('Index is required') + + cdef int tid + cdef uint64_t total = hts_idx_get_n_no_coor(self._index) + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: with nogil: - n = hts_idx_get_n_no_coor(self._index) - return n + hts_idx_get_stat(self._index, tid, &mapped, &unmapped) + total += unmapped + return total + + @property + def nocoordinate(self): + """total number of reads without coordinates according to the statistics recorded in the index. + + This is a read-only property. + + """ + if not self.has_index: + raise ValueError('Index is required') + + cdef uint64_t n + with nogil: + n = hts_idx_get_n_no_coor(self._index) + return n def get_index_statistics(self): """return statistics about mapped/unmapped reads per chromosome as they are stored in the index. Returns: - list : + list: a list of records for each chromosome. Each record has the attributes 'contig', 'mapped', 'unmapped' and 'total'. - """ - self.check_index() + """ + if not self.has_index: + raise ValueError('Index is required') cdef int tid + cdef uint64_t mapped, unmapped results = [] # TODO: use header @@ -1849,9 +1827,11 @@ cdef class AlignmentFile(HTSFile): return self.b cdef int cnext(self): - ''' - cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. - ''' + """cversion of iterator. + + Used by :class:`pysam.AlignmentFile.IteratorColumn`. + + """ cdef int ret cdef bam_hdr_t * hdr = self.header.ptr with nogil: @@ -1862,7 +1842,7 @@ cdef class AlignmentFile(HTSFile): def __next__(self): cdef int ret = self.cnext() - if (ret >= 0): + if ret >= 0: return makeAlignedSegment(self.b, self.header) elif ret == -2: raise IOError('truncated file') @@ -1872,75 +1852,78 @@ cdef class AlignmentFile(HTSFile): ########################################### # methods/properties referencing the header def is_valid_tid(self, int tid): - """ - return True if the numerical :term:`tid` is valid; False otherwise. + """True if the numerical :term:`tid` is valid; False otherwise. Note that the unmapped tid code (-1) counts as an invalid. 
+        """
         if self.header is None:
             raise ValueError("header not available in closed files")
         return self.header.is_valid_tid(tid)
 
     def get_tid(self, reference):
-        """
-        return the numerical :term:`tid` corresponding to
-        :term:`reference`
+        """numerical :term:`tid` corresponding to :term:`reference`.
 
         returns -1 if reference is not known.
+
         """
         if self.header is None:
             raise ValueError("header not available in closed files")
         return self.header.get_tid(reference)
 
     def get_reference_name(self, tid):
-        """
-        return :term:`reference` name corresponding to numerical :term:`tid`
-        """
+        """:term:`reference` name corresponding to numerical :term:`tid`."""
         if self.header is None:
             raise ValueError("header not available in closed files")
         return self.header.get_reference_name(tid)
 
     def get_reference_length(self, reference):
-        """
-        return :term:`reference` name corresponding to numerical :term:`tid`
-        """
+        """:term:`reference` length corresponding to :term:`reference` name."""
         if self.header is None:
             raise ValueError("header not available in closed files")
         return self.header.get_reference_length(reference)
 
-    property nreferences:
-        """int with the number of :term:`reference` sequences in the file.
-        This is a read-only attribute."""
-        def __get__(self):
-            if self.header:
-                return self.header.nreferences
-            else:
-                raise ValueError("header not available in closed files")
-
-    property references:
-        """tuple with the names of :term:`reference` sequences. This is a
-        read-only attribute"""
-        def __get__(self):
-            if self.header:
-                return self.header.references
-            else:
-                raise ValueError("header not available in closed files")
+    @property
+    def nreferences(self):
+        """number of :term:`reference` sequences in the file.
 
-    property lengths:
-        """tuple of the lengths of the :term:`reference` sequences. This is a
-        read-only attribute. The lengths are in the same order as
-        :attr:`pysam.AlignmentFile.references`
+        This is a read-only property.
 
         """
-        def __get__(self):
-            if self.header:
-                return self.header.lengths
-            else:
-                raise ValueError("header not available in closed files")
+        if self.header:
+            return self.header.nreferences
+        else:
+            raise ValueError("header not available in closed files")
+
+    @property
+    def references(self):
+        """tuple with the names of :term:`reference` sequences.
+
+        This is a read-only property.
+
+        """
+        if self.header:
+            return self.header.references
+        else:
+            raise ValueError("header not available in closed files")
+
+    @property
+    def lengths(self):
+        """tuple of the lengths of the :term:`reference` sequences.
+
+        The lengths are in the same order as :attr:`pysam.AlignmentFile.references`
+
+        This is a read-only property.
+
+        """
+        if self.header:
+            return self.header.lengths
+        else:
+            raise ValueError("header not available in closed files")
 
 
 cdef class IteratorRow:
-    '''abstract base class for iterators over mapped reads.
+    """abstract base class for iterators over mapped reads.
 
     Various iterators implement different behaviours for wrapping around
     contig boundaries. Examples include:
@@ -1962,8 +1945,7 @@
     explicitly. It is returned as a result of call to a
     :meth:`AlignmentFile.fetch`.
- ''' - + """ def __init__(self, AlignmentFile samfile, int multiple_iterators=False): cdef char *cfilename cdef char *creference_filename @@ -1985,7 +1967,7 @@ cdef class IteratorRow: self.htsfile = hts_open(cfilename, 'r') assert self.htsfile != NULL - if samfile.has_index(): + if samfile.has_index: if samfile.index_filename: cindexname = samfile.index_filename with nogil: @@ -2028,10 +2010,7 @@ cdef class IteratorRow: cdef class IteratorRowRegion(IteratorRow): - """*(AlignmentFile samfile, int tid, int beg, int stop, - int multiple_iterators=False)* - - iterate over mapped reads in a region. + """iterate over mapped reads in a region. .. note:: @@ -2040,12 +2019,11 @@ cdef class IteratorRowRegion(IteratorRow): :meth:`AlignmentFile.fetch`. """ - def __init__(self, AlignmentFile samfile, int tid, int beg, int stop, int multiple_iterators=False): - if not samfile.has_index(): + if not samfile.has_index: raise ValueError("no index available for iteration") IteratorRow.__init__(self, samfile, @@ -2064,7 +2042,7 @@ cdef class IteratorRowRegion(IteratorRow): return self.b cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' + """cversion of iterator. Used by IteratorColumn""" with nogil: self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), self.iter, @@ -2090,9 +2068,7 @@ cdef class IteratorRowRegion(IteratorRow): cdef class IteratorRowHead(IteratorRow): - """*(AlignmentFile samfile, n, int multiple_iterators=False)* - - iterate over first n reads in `samfile` + """iterate over first n reads in `samfile` .. note:: It is usually not necessary to create an object of this class @@ -2100,7 +2076,6 @@ cdef class IteratorRowHead(IteratorRow): :meth:`AlignmentFile.head`. """ - def __init__(self, AlignmentFile samfile, int n, @@ -2119,7 +2094,11 @@ cdef class IteratorRowHead(IteratorRow): return self.b cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' + """cversion of iterator. + + Used by IteratorColumn. + + """ cdef int ret cdef bam_hdr_t * hdr = self.header.ptr with nogil: @@ -2143,9 +2122,7 @@ cdef class IteratorRowHead(IteratorRow): cdef class IteratorRowAll(IteratorRow): - """*(AlignmentFile samfile, int multiple_iterators=False)* - - iterate over all reads in `samfile` + """iterate over all reads in `samfile` .. note:: @@ -2154,7 +2131,6 @@ cdef class IteratorRowAll(IteratorRow): :meth:`AlignmentFile.fetch`. """ - def __init__(self, AlignmentFile samfile, int multiple_iterators=False): @@ -2168,7 +2144,7 @@ cdef class IteratorRowAll(IteratorRow): return self.b cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' + """cversion of iterator. Used by IteratorColumn.""" cdef int ret cdef bam_hdr_t * hdr = self.header.ptr with nogil: @@ -2188,8 +2164,7 @@ cdef class IteratorRowAll(IteratorRow): cdef class IteratorRowAllRefs(IteratorRow): - """iterates over all mapped reads by chaining iterators over each - reference + """iterate over all mapped reads by chaining iterators over each reference. .. note:: It is usually not necessary to create an object of this class @@ -2197,14 +2172,13 @@ cdef class IteratorRowAllRefs(IteratorRow): :meth:`AlignmentFile.fetch`. 
""" - def __init__(self, AlignmentFile samfile, multiple_iterators=False): IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators) - if not samfile.has_index(): + if not samfile.has_index: raise ValueError("no index available for fetch") self.tid = -1 @@ -2253,15 +2227,13 @@ cdef class IteratorRowAllRefs(IteratorRow): cdef class IteratorRowSelection(IteratorRow): - """*(AlignmentFile samfile)* - - iterate over reads in `samfile` at a given list of file positions. + """iterate over reads in `samfile` at a given list of file positions. .. note:: It is usually not necessary to create an object of this class explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. - """ + """ def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators) @@ -2276,9 +2248,10 @@ cdef class IteratorRowSelection(IteratorRow): return self.b cdef int cnext(self): - '''cversion of iterator''' + """cversion of iterator.""" # end iteration if out of positions - if self.current_pos >= len(self.positions): return -1 + if self.current_pos >= len(self.positions): + return -1 cdef uint64_t pos = self.positions[self.current_pos] with nogil: @@ -2305,28 +2278,23 @@ cdef class IteratorRowSelection(IteratorRow): raise StopIteration -cdef int __advance_nofilter(void *data, bam1_t *b): - '''advance without any read filtering. - ''' +cdef int __advance_nofilter(void *data, bam1_t *b) nogil: + """advance without any read filtering.""" cdef __iterdata * d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - return ret + return sam_itr_next(d.htsfile, d.iter, b) -cdef int __advance_all(void *data, bam1_t *b): - '''only use reads for pileup passing basic filters such as +cdef int __advance_all(void *data, bam1_t *b) nogil: + """only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP - ''' + """ cdef __iterdata * d = <__iterdata*>data - cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP + cdef int mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP cdef int ret while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2335,17 +2303,14 @@ cdef int __advance_all(void *data, bam1_t *b): return ret -cdef int __advance_samtools(void * data, bam1_t * b): - '''advance using same filter and read processing as in - the samtools pileup. - ''' +cdef int __advance_samtools(void * data, bam1_t * b) nogil: + """advance using same filter and read processing as in the samtools pileup.""" cdef __iterdata * d = <__iterdata*>data cdef int ret cdef int q while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2397,7 +2362,7 @@ cdef int __advance_samtools(void * data, bam1_t * b): cdef class IteratorColumn: - '''abstract base class for iterators over columns. + """abstract base class for iterators over columns. IteratorColumn objects wrap the pileup functionality of samtools. @@ -2426,8 +2391,8 @@ cdef class IteratorColumn: and :meth:`seq_len`. See :class:`~AlignmentFile.pileup` for kwargs to the iterator. 
- ''' + """ def __cinit__( self, AlignmentFile samfile, **kwargs): self.samfile = samfile self.fastafile = kwargs.get("fastafile", None) @@ -2454,8 +2419,7 @@ cdef class IteratorColumn: return self cdef int cnext(self): - '''perform next iteration. - ''' + """perform next iteration.""" # do not release gil here because of call-backs cdef int ret = bam_mplp_auto(self.pileup_iter, &self.tid, @@ -2465,36 +2429,33 @@ cdef class IteratorColumn: return ret cdef char * get_sequence(self): - '''return current reference sequence underlying the iterator. - ''' + """current reference sequence underlying the iterator.""" return self.iterdata.seq - property seq_len: - '''current sequence length.''' - def __get__(self): - return self.iterdata.seq_len + @property + def seq_len(self): + """current sequence length.""" + return self.iterdata.seq_len def add_reference(self, FastaFile fastafile): - ''' - add reference sequences in `fastafile` to iterator.''' + """add reference sequences in `fastafile` to iterator.""" self.fastafile = fastafile if self.iterdata.seq != NULL: free(self.iterdata.seq) self.iterdata.tid = -1 self.iterdata.fastafile = self.fastafile.fastafile + @property def has_reference(self): - ''' - return true if iterator is associated with a reference''' - return self.fastafile + """True if iterator is associated with a reference.""" + return bool(self.fastafile) cdef _setup_iterator(self, int tid, int start, int stop, int multiple_iterators=0): - '''setup the iterator structure''' - + """setup the iterator structure.""" self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators) self.iterdata.htsfile = self.samfile.htsfile self.iterdata.iter = self.iter.iter @@ -2516,22 +2477,15 @@ cdef class IteratorColumn: if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_all, - data) + self.pileup_iter = bam_mplp_init(1, &__advance_all, data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_nofilter, - data) + self.pileup_iter = bam_mplp_init(1, &__advance_nofilter, data) elif self.stepper == "samtools": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_samtools, - data) + self.pileup_iter = bam_mplp_init(1, &__advance_samtools, data) else: - raise ValueError( - "unknown stepper option `%s` in IteratorColumn" % self.stepper) + raise ValueError("unknown stepper option `%s` in IteratorColumn" % self.stepper) if self.max_depth: with nogil: @@ -2542,11 +2496,12 @@ cdef class IteratorColumn: bam_mplp_init_overlaps(self.pileup_iter) cdef reset(self, tid, start, stop): - '''reset iterator position. + """reset iterator position. This permits using the iterator multiple times without having to incur the full set-up costs. - ''' + + """ self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0) self.iterdata.iter = self.iter.iter @@ -2564,10 +2519,12 @@ cdef class IteratorColumn: bam_mplp_reset(self.pileup_iter) cdef _free_pileup_iter(self): - '''free the memory alloc'd by bam_plp_init. + """free the memory alloc'd by bam_plp_init. This is needed before setup_iterator allocates another - pileup_iter, or else memory will be lost. ''' + pileup_iter, or else memory will be lost. 
+ + """ if self.pileup_iter != NULL: with nogil: bam_mplp_reset(self.pileup_iter) @@ -2584,19 +2541,9 @@ cdef class IteratorColumn: free(self.iterdata.seq) self.iterdata.seq = NULL - # backwards compatibility - - def hasReference(self): - return self.has_reference() - cdef char * getSequence(self): - return self.get_sequence() - def addReference(self, FastaFile fastafile): - return self.add_reference(fastafile) - cdef class IteratorColumnRegion(IteratorColumn): - '''iterates over a region only. - ''' + """iterates over a region only.""" def __cinit__(self, AlignmentFile samfile, int tid = 0, @@ -2645,9 +2592,7 @@ cdef class IteratorColumnRegion(IteratorColumn): cdef class IteratorColumnAllRefs(IteratorColumn): - """iterates over all columns by chaining iterators over each reference - """ - + """iterates over all columns by chaining iterators over each reference.""" def __cinit__(self, AlignmentFile samfile, **kwargs): @@ -2687,7 +2632,7 @@ cdef class IteratorColumnAllRefs(IteratorColumn): cdef class SNPCall: - '''the results of a SNP call.''' + """results of a SNP call.""" cdef int _tid cdef int _pos cdef char _reference_base @@ -2697,59 +2642,61 @@ cdef class SNPCall: cdef int _rms_mapping_quality cdef int _coverage - property tid: - '''the chromosome ID as is defined in the header''' - def __get__(self): - return self._tid - - property pos: - '''nucleotide position of SNP.''' - def __get__(self): return self._pos - - property reference_base: - '''reference base at pos. ``N`` if no reference sequence supplied.''' - def __get__(self): return from_string_and_size( &self._reference_base, 1 ) - - property genotype: - '''the genotype called.''' - def __get__(self): return from_string_and_size( &self._genotype, 1 ) - - property consensus_quality: - '''the genotype quality (Phred-scaled).''' - def __get__(self): return self._consensus_quality - - property snp_quality: - '''the snp quality (Phred scaled) - probability of consensus being - identical to reference sequence.''' - def __get__(self): return self._snp_quality - - property mapping_quality: - '''the root mean square (rms) of the mapping quality of all reads - involved in the call.''' - def __get__(self): return self._rms_mapping_quality - - property coverage: - '''coverage or read depth - the number of reads involved in the call.''' - def __get__(self): return self._coverage + @property + def tid(self): + """chromosome ID as is defined in the header.""" + return self._tid + + @property + def pos(self): + """nucleotide position of SNP.""" + return self._pos + + @property + def reference_base(self): + """reference base at pos. 
``N`` if no reference sequence supplied.""" + return from_string_and_size(&self._reference_base, 1) + + @property + def genotype(self): + """genotype called.""" + return from_string_and_size(&self._genotype, 1) + + @property + def consensus_quality(self): + """the genotype quality (Phred-scaled).""" + return self._consensus_quality + + @property + def snp_quality(self): + """snp quality (Phred scaled) - probability of consensus being identical to reference sequence.""" + return self._snp_quality + + @property + def mapping_quality(self): + """the root mean square (rms) of the mapping quality of all reads involved in the call.""" + return self._rms_mapping_quality + + @property + def coverage(self): + """coverage or read depth - the number of reads involved in the call.""" + return self._coverage def __str__(self): - - return "\t".join( map(str, ( - self.tid, - self.pos, - self.reference_base, - self.genotype, - self.consensus_quality, - self.snp_quality, - self.mapping_quality, - self.coverage ) ) ) + return "\t".join(map(str, ( + self.tid, + self.pos, + self.reference_base, + self.genotype, + self.consensus_quality, + self.snp_quality, + self.mapping_quality, + self.coverage)) + ) cdef class IndexedReads: - """*(AlignmentFile samfile, multiple_iterators=True) - - Index a Sam/BAM-file by query name while keeping the - original sort order intact. + """Index a Sam/BAM-file by query name while keeping the original sort order intact. The index is kept in memory and can be substantial. @@ -2768,7 +2715,6 @@ cdef class IndexedReads: existing iterators being affected by the indexing. """ - def __init__(self, AlignmentFile samfile, int multiple_iterators=True): cdef char *cfilename @@ -2801,8 +2747,7 @@ cdef class IndexedReads: self.owns_samfile = False def build(self): - '''build the index.''' - + """build the index.""" self.index = collections.defaultdict(list) # this method will start indexing from the current file position @@ -2828,7 +2773,7 @@ cdef class IndexedReads: bam_destroy1(b) def find(self, query_name): - '''find `query_name` in index. + """find `query_name` in index. Returns ------- @@ -2842,15 +2787,15 @@ cdef class IndexedReads: KeyError if the `query_name` is not in the index. 
-        '''
-        if query_name in self.index:
-            return IteratorRowSelection(
-                self.samfile,
-                self.index[query_name],
-                multiple_iterators = False)
-        else:
+        """
+        if query_name not in self.index:
             raise KeyError("read %s not found" % query_name)
 
+        return IteratorRowSelection(
+            self.samfile,
+            self.index[query_name],
+            multiple_iterators = False)
+
     def __dealloc__(self):
         if self.owns_samfile:
             hts_close(self.htsfile)
diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd
index 1d4129b..8f2f676 100644
--- a/pysam/libcbcf.pxd
+++ b/pysam/libcbcf.pxd
@@ -1,4 +1,4 @@
-###############################################################################
+# cython: language_level=3, embedsignature=True, profile=True
 ###############################################################################
 ## Cython wrapper for htslib VCF/BCF reader/writer
 ###############################################################################
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx
index cc5ee99..46056c5 100644
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -1,6 +1,4 @@
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
+# cython: language_level=3, embedsignature=True, profile=True
 ###############################################################################
 ## Cython wrapper for htslib VCF/BCF reader/writer
 ###############################################################################
@@ -2119,6 +2117,7 @@ cdef class VariantHeader(object):
         if self.ptr.dirty:
             bcf_hdr_sync(self.ptr)
 
+
     def add_meta(self, key, value=None, items=None):
         """Add metadata to this header"""
         if not ((value is not None) ^ (items is not None)):
@@ -2135,11 +2134,16 @@ cdef class VariantHeader(object):
                 hrec.value = strdup(force_bytes(value))
             else:
                 for key, value in items:
+
+                    quoted = True
+                    if key in set(("ID", "Number", "Type")):
+                        quoted = False
+
                     key = force_bytes(key)
                     bcf_hrec_add_key(hrec, key, len(key))
                     value = force_bytes(str(value))
-                    quoted = strpbrk(value, ' ;,"\t<>') != NULL
+
                     bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted)
         except:
             bcf_hrec_destroy(hrec)
@@ -2984,7 +2988,10 @@ cdef class VariantRecord(object):
 
     def copy(self):
         """return a copy of this VariantRecord object"""
-        return makeVariantRecord(self.header, bcf_dup(self.ptr))
+        cdef bcf1_t *r = bcf_dup(self.ptr)
+        if bcf_unpack(r, BCF_UN_ALL) < 0:
+            raise ValueError('Error unpacking VariantRecord')
+        return makeVariantRecord(self.header, r)
 
     def translate(self, VariantHeader dst_header):
         if dst_header is None:
@@ -4446,3 +4453,4 @@ cdef class VariantFile(HTSFile):
         # potentially unnecessary optimization that also sets max_unpack
         if not include_samples:
             self.drop_samples = True
+
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx
index 02ff2a2..fd1d881 100644
--- a/pysam/libcbgzf.pyx
+++ b/pysam/libcbgzf.pyx
@@ -1,3 +1,5 @@
+# cython: language_level=3, embedsignature=True, profile=True
+
 """Functions that read and write block gzipped files.
 
 The user of the file doesn't have to worry about the compression
@@ -25,7 +27,7 @@ __all__ = ["BGZFile"]
 BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
 
 
-cdef class BGZFile(object):
+cdef class BGZFile(io.IOBase):
    """The BGZFile class simulates most of the methods of a file object
    with the exception of the truncate() method.
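The add_meta() change in the libcbcf.pyx hunk above hardcodes which structured-field
keys stay unquoted instead of scanning the value for special characters. A minimal
sketch of the intended effect, using the public VariantHeader API from released
pysam (the expected output line is an illustration of the new rule, not taken from
this patch):

    import pysam

    hdr = pysam.VariantHeader()
    # ID/Number/Type are emitted bare; other values such as Description
    # are wrapped in double quotes.
    hdr.add_meta("INFO", items=[("ID", "DP"),
                                ("Number", 1),
                                ("Type", "Integer"),
                                ("Description", "Total read depth")])
    # Expected header line:
    # ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
    print(str(hdr))
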
@@ -162,8 +164,7 @@ cdef class BGZFile(object): raise AttributeError('fileno') def rewind(self): - '''Return the uncompressed stream file position indicator to the - beginning of the file''' + """Return the uncompressed stream file position indicator to the beginning of the file""" if not self.bgzf: raise ValueError("rewind() on closed BGZFile object") if not self.bgzf.is_write: @@ -213,7 +214,7 @@ cdef class BGZFile(object): line.l = line.m = 0 line.s = NULL - cdef int ret = bgzf_getline(self.bgzf, '\n', &line) + cdef int ret = bgzf_getline(self.bgzf, b'\n', &line) if ret == -1: s = b'' elif ret == -2: diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd index 6dda60f..98f153e 100644 --- a/pysam/libcfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index a29c7fc..7fb17ec 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -1,6 +1,4 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### +# cython: language_level=3, embedsignature=True, profile=True ############################################################################### # Cython wrapper for SAM/BAM/CRAM files based on htslib ############################################################################### @@ -67,7 +65,7 @@ from pysam.libcutils cimport qualitystring_to_array, parse_region cdef class FastqProxy cdef makeFastqProxy(kseq_t * src): - '''enter src into AlignedRead.''' + """enter src into AlignedRead.""" cdef FastqProxy dest = FastqProxy.__new__(FastqProxy) dest._delegate = src return dest @@ -76,8 +74,7 @@ cdef makeFastqProxy(kseq_t * src): ## add automatic indexing. ## add function to get sequence names. cdef class FastaFile: - """Random access to fasta formatted files that - have been indexed by :term:`faidx`. + """Random access to fasta formatted files that have been indexed by :term:`faidx`. The file is automatically opened. The index file of file ```` is expected to be called ``.fai``. @@ -106,7 +103,6 @@ cdef class FastaFile: if file could not be opened """ - def __cinit__(self, *args, **kwargs): self.fastafile = NULL self._filename = None @@ -116,7 +112,7 @@ cdef class FastaFile: self._open(*args, **kwargs) def is_open(self): - '''return true if samfile has been opened.''' + """return true if samfile has been opened.""" return self.fastafile != NULL def __len__(self): @@ -126,11 +122,11 @@ cdef class FastaFile: return faidx_nseq(self.fastafile) def _open(self, filename, filepath_index=None, filepath_index_compressed=None): - '''open an indexed fasta file. + """open an indexed fasta file. This method expects an indexed fasta file. 
- ''' + """ # close a previously opened file if self.fastafile != NULL: self.close() @@ -140,7 +136,7 @@ cdef class FastaFile: cdef char *cindexname = NULL cdef char *cindexname_compressed = NULL self.is_remote = hisremote(cfilename) - + # open file for reading if (self._filename != b"-" and not self.is_remote @@ -158,7 +154,7 @@ cdef class FastaFile: if not os.path.exists(filepath_index): raise IOError("filename {} does not exist".format(filepath_index)) cindexname = bindex_filename = encode_filename(filepath_index) - + if filepath_index_compressed: if not os.path.exists(filepath_index_compressed): raise IOError("filename {} does not exist".format(filepath_index_compressed)) @@ -206,33 +202,38 @@ cdef class FastaFile: self.close() return False - property closed: + @property + def closed(self): """bool indicating the current state of the file object. + This is a read-only attribute; the close() method changes the value. + """ - def __get__(self): - return not self.is_open() + return not self.is_open() - property filename: + @property + def filename(self): """filename associated with this object. This is a read-only attribute.""" - def __get__(self): - return self._filename + return self._filename + + @property + def references(self): + """tuple with the names of :term:`reference` sequences.""" + return self._references - property references: - '''tuple with the names of :term:`reference` sequences.''' - def __get__(self): - return self._references + @property + def nreferences(self): + """number of :term:`reference` sequences in the file. - property nreferences: - """int with the number of :term:`reference` sequences in the file. - This is a read-only attribute.""" - def __get__(self): - return len(self._references) if self.references else None + This is a read-only attribute. - property lengths: + """ + return len(self._references) if self.references else None + + @property + def lengths(self): """tuple with the lengths of :term:`reference` sequences.""" - def __get__(self): - return self._lengths + return self._lengths def fetch(self, reference=None, @@ -271,9 +272,8 @@ cdef class FastaFile: if the region is invalid """ - if not self.is_open(): - raise ValueError("I/O operation on closed file" ) + raise ValueError("I/O operation on closed file") cdef int length cdef char *seq @@ -317,8 +317,7 @@ cdef class FastaFile: free(seq) cdef char *_fetch(self, char *reference, int start, int end, int *length) except? 
NULL: - '''fetch sequence for reference, start and end''' - + """fetch sequence for reference, start and end.""" cdef char *seq with nogil: seq = faidx_fetch_seq(self.fastafile, @@ -336,45 +335,41 @@ cdef class FastaFile: return seq def get_reference_length(self, reference): - '''return the length of reference.''' + """length of reference.""" return self.reference2length[reference] def __getitem__(self, reference): return self.fetch(reference) def __contains__(self, reference): - '''return true if reference in fasta file.''' + """True if reference in fasta file.""" return reference in self.reference2length cdef class FastqProxy: """A single entry in a fastq file.""" - def __init__(self): pass - - property name: + @property + def name(self): """The name of each entry in the fastq file.""" - def __get__(self): - return charptr_to_str(self._delegate.name.s) + return charptr_to_str(self._delegate.name.s) - property sequence: + @property + def sequence(self): """The sequence of each entry in the fastq file.""" - def __get__(self): - return charptr_to_str(self._delegate.seq.s) + return charptr_to_str(self._delegate.seq.s) - property comment: - def __get__(self): - if self._delegate.comment.l: - return charptr_to_str(self._delegate.comment.s) - else: - return None + @property + def comment(self): + if not self._delegate.comment.l: + return None + return charptr_to_str(self._delegate.comment.s) - property quality: + @property + def quality(self): """The quality score of each entry in the fastq file, represented as a string.""" - def __get__(self): - if self._delegate.qual.l: - return charptr_to_str(self._delegate.qual.s) - else: - return None + if not self._delegate.qual.l: + return None + return charptr_to_str(self._delegate.qual.s) cdef cython.str to_string(self): if self.comment is None: @@ -387,12 +382,12 @@ cdef class FastqProxy: else: return "@%s%s\n%s\n+\n%s" % (self.name, comment, self.sequence, self.quality) - + def __str__(self): return self.to_string() cpdef array.array get_quality_array(self, int offset=33): - '''return quality values as integer array after subtracting offset.''' + """return quality values as integer array after subtracting offset.""" if self.quality is None: return None return qualitystring_to_array(force_bytes(self.quality), @@ -434,7 +429,7 @@ cdef class FastxRecord: if self.sequence is None: raise ValueError("can not write record without a sequence") - + if self.comment is None: comment = "" else: @@ -445,19 +440,18 @@ cdef class FastxRecord: else: return "@%s%s\n%s\n+\n%s" % (self.name, comment, self.sequence, self.quality) - + + # FIXME: upgrade into properties def set_name(self, name): if name is None: raise ValueError("FastxRecord must have a name and not None") self.name = name def set_comment(self, comment): - self.comment = comment - - def set_sequence(self, sequence, quality=None): - """set sequence of this record. 
+ self.comment = comment - """ + def set_sequence(self, sequence, quality=None): + """set sequence of this record.""" self.sequence = sequence if quality is not None: if len(sequence) != len(quality): @@ -472,7 +466,7 @@ cdef class FastxRecord: return self.to_string() cpdef array.array get_quality_array(self, int offset=33): - '''return quality values as array after subtracting offset.''' + """return quality values as array after subtracting offset.""" if self.quality is None: return None return qualitystring_to_array(force_bytes(self.quality), @@ -530,12 +524,13 @@ cdef class FastxFile: self.entry = NULL self._open(*args, **kwargs) + @property def is_open(self): - '''return true if samfile has been opened.''' + """True if samfile has been opened.""" return self.entry != NULL def _open(self, filename, persist=True): - '''open a fastq/fasta file in *filename* + """open a fastq/fasta file in *filename* Paramentes ---------- @@ -546,7 +541,7 @@ cdef class FastxFile: True). The copy will persist even if the iteration on the file continues. - ''' + """ if self.fastqfile != NULL: self.close() @@ -568,7 +563,7 @@ cdef class FastxFile: self._filename = filename def close(self): - '''close the file.''' + """close the file.""" if self.fastqfile != NULL: bgzf_close(self.fastqfile) self.fastqfile = NULL @@ -590,20 +585,22 @@ cdef class FastxFile: self.close() return False - property closed: + @property + def closed(self): """bool indicating the current state of the file object. + This is a read-only attribute; the close() method changes the value. + """ - def __get__(self): - return not self.is_open() + return not self.is_open - property filename: + @property + def filename(self): """string with the filename associated with this object.""" - def __get__(self): - return self._filename + return self._filename def __iter__(self): - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") return self @@ -611,15 +608,12 @@ cdef class FastxFile: return self.entry cdef int cnext(self): - '''C version of iterator - ''' + """C version of iterator.""" with nogil: return kseq_read(self.entry) def __next__(self): - """ - python version of next(). 
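With is_open now a property rather than a method, callers drop the parentheses. A hedged usage sketch (the input filename is hypothetical):

import pysam

with pysam.FastxFile("example.fastq") as fh:
    print(fh.is_open)               # property now, no call parentheses
    for entry in fh:
        print(entry.name, len(entry.sequence))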
- """ + """python version of next().""" cdef int l with nogil: l = kseq_read(self.entry) diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index c7d4c35..fc8cd3c 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index b4dcaa8..f66e53f 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -1,5 +1,5 @@ -# cython: embedsignature=True -# cython: profile=True +# cython: language_level=3, embedsignature=True, profile=True + # adds doc-strings for sphinx ######################################################################## @@ -68,7 +68,7 @@ cpdef get_verbosity(): ## HFile wrapper class ######################################################################## -cdef class HFile(object): +cdef class HFile(io.IOBase): cdef hFILE *fp cdef readonly object name, mode @@ -292,41 +292,15 @@ cdef class HFile(object): self.write(line) -######################################################################## -######################################################################## -## Helpers for backward compatibility to hide the difference between -## boolean properties and methods -######################################################################## - -class CallableValue(object): - def __init__(self, value): - self.value = value - def __call__(self): - return self.value - def __bool__(self): - return self.value - def __nonzero__(self): - return self.value - def __eq__(self, other): - return self.value == other - def __ne__(self, other): - return self.value != other - - -CTrue = CallableValue(True) -CFalse = CallableValue(False) - - ######################################################################## ######################################################################## ## HTSFile wrapper class (base class for AlignmentFile and VariantFile) ######################################################################## -cdef class HTSFile(object): +cdef class HTSFile(io.IOBase): """ Base class for HTS file types """ - def __cinit__(self, *args, **kwargs): self.htsfile = NULL self.threads = 1 @@ -417,56 +391,21 @@ cdef class HTSFile(object): finally: free(desc) - @property - def is_open(self): - """return True if HTSFile is open and in a valid state.""" - return CTrue if self.htsfile != NULL else CFalse - - @property - def is_closed(self): - """return True if HTSFile is closed.""" - return self.htsfile == NULL - @property def closed(self): """return True if HTSFile is closed.""" return self.htsfile == NULL @property - def is_write(self): + def writable(self): """return True if HTSFile is open for writing""" return self.htsfile != NULL and self.htsfile.is_write != 0 @property - def is_read(self): + def readable(self): """return True if HTSFile is open for reading""" return self.htsfile != NULL and self.htsfile.is_write == 0 - @property - def is_sam(self): - """return True if HTSFile is reading or writing a SAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == sam - - @property - def is_bam(self): - """return True if HTSFile is reading or writing a BAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == bam - - @property - def is_cram(self): - """return True if HTSFile is reading or writing a BAM alignment 
file""" - return self.htsfile != NULL and self.htsfile.format.format == cram - - @property - def is_vcf(self): - """return True if HTSFile is reading or writing a VCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == vcf - - @property - def is_bcf(self): - """return True if HTSFile is reading or writing a BCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == bcf - def reset(self): """reset file position to beginning of file just after the header. @@ -533,42 +472,42 @@ cdef class HTSFile(object): if htsfile != NULL: hts_set_threads(htsfile, threads) return htsfile - else: - if isinstance(self.filename, int): - fd = self.filename - else: - fd = self.filename.fileno() - if self.duplicate_filehandle: - dup_fd = dup(fd) - else: - dup_fd = fd + if isinstance(self.filename, int): + fd = self.filename + else: + fd = self.filename.fileno() - # Replicate mode normalization done in hts_open_format - smode = self.mode.replace(b'b', b'').replace(b'c', b'') - if b'b' in self.mode: - smode += b'b' - elif b'c' in self.mode: - smode += b'c' - cmode = smode + if self.duplicate_filehandle: + dup_fd = dup(fd) + else: + dup_fd = fd - hfile = hdopen(dup_fd, cmode) - if hfile == NULL: - raise IOError('Cannot create hfile') + # Replicate mode normalization done in hts_open_format + smode = self.mode.replace(b'b', b'').replace(b'c', b'') + if b'b' in self.mode: + smode += b'b' + elif b'c' in self.mode: + smode += b'c' + cmode = smode - try: - # filename.name can be an int - filename = str(self.filename.name) - except AttributeError: - filename = ''.format(fd) + hfile = hdopen(dup_fd, cmode) + if hfile == NULL: + raise IOError('Cannot create hfile') - filename = encode_filename(filename) - cfilename = filename - with nogil: - htsfile = hts_hopen(hfile, cfilename, cmode) - if htsfile != NULL: - hts_set_threads(htsfile, threads) - return htsfile + try: + # filename.name can be an int + filename = str(self.filename.name) + except AttributeError: + filename = ''.format(fd) + + filename = encode_filename(filename) + cfilename = filename + with nogil: + htsfile = hts_hopen(hfile, cfilename, cmode) + if htsfile != NULL: + hts_set_threads(htsfile, threads) + return htsfile def add_hts_options(self, format_options=None): """Given a list of key=value format option strings, add them to an open htsFile diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index c986f03..6ade269 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free @@ -120,8 +122,3 @@ cdef class GZIteratorHead(GZIterator): cdef class GZIteratorParsed(GZIterator): cdef Parser parser - - -# Compatibility Layer for pysam < 0.8 -cdef class Tabixfile(TabixFile): - pass diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index 840161a..ec9f929 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -1,6 +1,4 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### +# cython: language_level=3, embedsignature=True, profile=True ############################################################################### # Cython wrapper for access to tabix indexed files in bgzf format ############################################################################### @@ -24,10 
+22,6 @@ # class TabixIterator iterator class over rows in bgzf file # class EmptyIterator # -# For backwards compatibility, the following classes are also defined: -# -# class Tabixfile equivalent to TabixFile -# ############################################################################### # # The MIT License @@ -99,10 +93,10 @@ cdef class Parser: cdef class asTuple(Parser): - '''converts a :term:`tabix row` into a python tuple. + """converts a :term:`tabix row` into a python tuple. A field in a row is accessed by numeric index. - ''' + """ cdef parse(self, char * buffer, int len): cdef ctabixproxies.TupleProxy r r = ctabixproxies.TupleProxy(self.encoding) @@ -113,9 +107,9 @@ cdef class asTuple(Parser): cdef class asGFF3(Parser): - '''converts a :term:`tabix row` into a GFF record with the following + """converts a :term:`tabix row` into a GFF record with the following fields: - + +----------+----------+-------------------------------+ |*Column* |*Name* |*Content* | +----------+----------+-------------------------------+ @@ -140,7 +134,7 @@ cdef class asGFF3(Parser): |9 |attributes|the attribute field | +----------+----------+-------------------------------+ - ''' + """ cdef parse(self, char * buffer, int len): cdef ctabixproxies.GFF3Proxy r r = ctabixproxies.GFF3Proxy(self.encoding) @@ -149,9 +143,9 @@ cdef class asGFF3(Parser): cdef class asGTF(Parser): - '''converts a :term:`tabix row` into a GTF record with the following + """converts a :term:`tabix row` into a GTF record with the following fields: - + +----------+----------+-------------------------------+ |*Column* |*Name* |*Content* | +----------+----------+-------------------------------+ @@ -187,16 +181,16 @@ cdef class asGTF(Parser): |transcript_id |the transcript identifier | +--------------------+------------------------------+ - ''' + """ cdef parse(self, char * buffer, int len): cdef ctabixproxies.GTFProxy r r = ctabixproxies.GTFProxy(self.encoding) r.copy(buffer, len) return r - + cdef class asBed(Parser): - '''converts a :term:`tabix row` into a bed record + """converts a :term:`tabix row` into a bed record with the following fields: +-----------+-----------+------------------------------------------+ @@ -235,7 +229,7 @@ cdef class asBed(Parser): fields are optional, but if one is defined, all the preceding need to be defined as well. - ''' + """ cdef parse(self, char * buffer, int len): cdef ctabixproxies.BedProxy r r = ctabixproxies.BedProxy(self.encoding) @@ -243,10 +237,10 @@ cdef class asBed(Parser): return r -cdef class asVCF(Parser): - '''converts a :term:`tabix row` into a VCF record with +cdef class asVCF(Parser): + """converts a :term:`tabix row` into a VCF record with the following fields: - + +----------+---------+------------------------------------+ |*Column* |*Field* |*Contents* | | | | | @@ -276,7 +270,7 @@ cdef class asVCF(Parser): first_sample_genotype = vcf[0] second_sample_genotype = vcf[1] - ''' + """ cdef parse(self, char * buffer, int len): cdef ctabixproxies.VCFProxy r r = ctabixproxies.VCFProxy(self.encoding) @@ -291,10 +285,10 @@ cdef class TabixFile: The file is automatically opened. The index file of file ```` is expected to be called ``.tbi`` by default (see parameter `index`). - + Parameters ---------- - + filename : string Filename of bgzf file to be opened. @@ -304,9 +298,9 @@ cdef class TabixFile: mode : char The file opening mode. Currently, only ``r`` is permitted. - + parser : :class:`pysam.Parser` - + sets the default parser for this tabix file. 
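For instance, a default parser set at construction applies to every later fetch (hedged sketch; the file and region are hypothetical):

import pysam

tbx = pysam.TabixFile("example.bed.gz", parser=pysam.asBed())
for row in tbx.fetch("chr1", 1000, 2000):
    print(row.contig, row.start, row.end)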
If `parser` is None, the results are returned as an unparsed string. Otherwise, `parser` is assumed to be a functor that will return @@ -324,7 +318,7 @@ cdef class TabixFile: Raises ------ - + ValueError if index file is missing. @@ -355,7 +349,7 @@ cdef class TabixFile: index=None, threads=1, ): - '''open a :term:`tabix file` for reading.''' + """open a :term:`tabix file` for reading.""" if mode != 'r': raise ValueError("invalid file opening mode `%s`" % mode) @@ -388,7 +382,7 @@ cdef class TabixFile: if self.htsfile == NULL: raise IOError("could not open file `%s`" % filename) - + #if self.htsfile.format.category != region_list: # raise ValueError("file does not contain region data") @@ -402,10 +396,10 @@ cdef class TabixFile: self.start_offset = self.tell() def _dup(self): - '''return a copy of this tabix file. - + """return a copy of this tabix file. + The file is being re-opened. - ''' + """ return TabixFile(self.filename, mode="r", threads=self.threads, @@ -413,33 +407,33 @@ cdef class TabixFile: index=self.filename_index, encoding=self.encoding) - def fetch(self, + def fetch(self, reference=None, - start=None, - end=None, + start=None, + end=None, region=None, parser=None, multiple_iterators=False): - '''fetch one or more rows in a :term:`region` using 0-based + """fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. - Without *reference* or *region* all entries will be fetched. - + Without *reference* or *region* all entries will be fetched. + If only *reference* is set, all reads matching on *reference* will be fetched. If *parser* is None, the default parser will be used for parsing. - + Set *multiple_iterators* to true if you will be using multiple iterators on the same file at the same time. The iterator returned will receive its own copy of a filehandle to the file effectively re-opening the file. Re-opening a file creates some overhead, so beware. - ''' + """ if not self.is_open(): raise ValueError("I/O operation on closed file") @@ -450,7 +444,7 @@ cdef class TabixFile: raise ValueError("end out of range (%i)" % end) if start is None: start = 0 - + if start < 0: raise ValueError("start out of range (%i)" % end) elif start > end: @@ -507,13 +501,13 @@ cdef class TabixFile: raise ValueError( "could not create iterator for region '%s'" % region) - + # use default parser if no parser is specified if parser is None: parser = fileobj.parser cdef TabixIterator a - if parser is None: + if parser is None: a = TabixIterator(encoding=fileobj.encoding) else: parser.set_encoding(fileobj.encoding) @@ -529,75 +523,72 @@ cdef class TabixFile: ############################################################### ## properties ############################################################### - property header: - '''the file header. + @property + def header(self): + """the file header. The file header consists of the lines at the beginning of a file that are prefixed by the comment character ``#``. - + .. note:: The header is returned as an iterator presenting lines without the newline character. 
- ''' - - def __get__(self): - - cdef char *cfilename = self.filename - cdef char *cfilename_index = self.filename_index - - cdef kstring_t buffer - buffer.l = buffer.m = 0 - buffer.s = NULL - - cdef htsFile * fp = NULL - cdef int KS_SEP_LINE = 2 - cdef tbx_t * tbx = NULL - lines = [] - with nogil: - fp = hts_open(cfilename, 'r') - - if fp == NULL: - raise OSError("could not open {} for reading header".format(self.filename)) + """ + cdef char *cfilename = self.filename + cdef char *cfilename_index = self.filename_index - with nogil: - tbx = tbx_index_load2(cfilename, cfilename_index) - - if tbx == NULL: - raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) + cdef kstring_t buffer + buffer.l = buffer.m = 0 + buffer.s = NULL - while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: - if not buffer.l or buffer.s[0] != tbx.conf.meta_char: - break - lines.append(force_str(buffer.s, self.encoding)) + cdef htsFile * fp = NULL + cdef int KS_SEP_LINE = 2 + cdef tbx_t * tbx = NULL + lines = [] + with nogil: + fp = hts_open(cfilename, 'r') - with nogil: - hts_close(fp) - free(buffer.s) + if fp == NULL: + raise OSError("could not open {} for reading header".format(self.filename)) + + with nogil: + tbx = tbx_index_load2(cfilename, cfilename_index) - return lines + if tbx == NULL: + raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) + + while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: + if not buffer.l or buffer.s[0] != tbx.conf.meta_char: + break + lines.append(force_str(buffer.s, self.encoding)) + + with nogil: + hts_close(fp) + free(buffer.s) + + return lines + + @property + def contigs(self): + """list of chromosome names""" + cdef char ** sequences + cdef int nsequences + + with nogil: + sequences = tbx_seqnames(self.index, &nsequences) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append(force_str(sequences[x])) + + # htslib instructions: + # only free container, not the sequences themselves + free(sequences) + + return result - property contigs: - '''list of chromosome names''' - def __get__(self): - cdef char ** sequences - cdef int nsequences - - with nogil: - sequences = tbx_seqnames(self.index, &nsequences) - cdef int x - result = [] - for x from 0 <= x < nsequences: - result.append(force_str(sequences[x])) - - # htslib instructions: - # only free container, not the sequences themselves - free(sequences) - - return result - def close(self): - ''' - closes the :class:`pysam.TabixFile`.''' + """closes the :class:`pysam.TabixFile`.""" if self.htsfile != NULL: hts_close(self.htsfile) self.htsfile = NULL @@ -623,20 +614,20 @@ cdef class TabixIterator: def __init__(self, encoding="ascii"): self.encoding = encoding - + def __iter__(self): self.buffer.s = NULL self.buffer.l = 0 self.buffer.m = 0 - return self + return self cdef int __cnext__(self): - '''iterate to next element. - + """iterate to next element. + Return -5 if file has been closed when this function was called. - ''' + """ if self.tabixfile.htsfile == NULL: return -5 @@ -653,17 +644,17 @@ cdef class TabixIterator: if retval < 0: break - if self.buffer.s[0] != '#': + if self.buffer.s[0] != b'#': break return retval - def __next__(self): + def __next__(self): """python version of next(). 
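With the Python-2-era next() aliases removed below, iteration relies on __next__ alone; use the next() builtin or a for-loop. Hedged sketch (hypothetical file and region):

import pysam

tbx = pysam.TabixFile("example.bed.gz")
it = tbx.fetch("chr1", 0, 1000)
first = next(it)        # the builtin dispatches to __next__
# it.next() now raises AttributeError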
pyrex uses this non-standard name instead of next() """ - + cdef int retval = self.__cnext__() if retval == -5: raise IOError("iteration on closed file") @@ -672,9 +663,6 @@ cdef class TabixIterator: return charptr_to_str(self.buffer.s, self.encoding) - def next(self): - return self.__next__() - def __dealloc__(self): if self.iterator != NULL: tbx_itr_destroy(self.iterator) @@ -683,14 +671,11 @@ cdef class TabixIterator: class EmptyIterator: - '''empty iterator''' + """empty iterator""" def __iter__(self): return self - def next(self): - raise StopIteration() - def __next__(self): raise StopIteration() @@ -703,18 +688,18 @@ cdef class TabixIteratorParsed(TabixIterator): Returns parsed data. """ - def __init__(self, + def __init__(self, Parser parser): - + TabixIterator.__init__(self) self.parser = parser - def __next__(self): + def __next__(self): """python version of next(). pyrex uses this non-standard name instead of next() """ - + cdef int retval = self.__cnext__() if retval == -5: raise IOError("iteration on closed file") @@ -727,9 +712,9 @@ cdef class TabixIteratorParsed(TabixIterator): cdef class GZIterator: def __init__(self, filename, int buffer_size=65536, encoding="ascii"): - '''iterate line-by-line through gzip (or bgzip) + """iterate line-by-line through gzip (or bgzip) compressed file. - ''' + """ if not os.path.exists(filename): raise IOError("No such file or directory: %s" % filename) @@ -746,7 +731,7 @@ cdef class GZIterator: self.buffer.s = malloc(buffer_size) def __dealloc__(self): - '''close file.''' + """close file.""" if self.gzipfile != NULL: bgzf_close(self.gzipfile) self.gzipfile = NULL @@ -763,9 +748,9 @@ cdef class GZIterator: cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) - - if retval < 0: + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) + + if retval < 0: break return dret @@ -781,9 +766,9 @@ cdef class GZIterator: cdef class GZIteratorHead(GZIterator): - '''iterate line-by-line through gzip (or bgzip) + """iterate line-by-line through gzip (or bgzip) compressed file returning comments at top of file. - ''' + """ def __next__(self): """python version of next(). @@ -791,16 +776,16 @@ cdef class GZIteratorHead(GZIterator): cdef int retval = self.__cnext__() if retval < 0: raise StopIteration - if self.buffer.s[0] == '#': + if self.buffer.s[0] == b'#': return self.buffer.s else: raise StopIteration cdef class GZIteratorParsed(GZIterator): - '''iterate line-by-line through gzip (or bgzip) + """iterate line-by-line through gzip (or bgzip) compressed file returning comments at top of file. - ''' + """ def __init__(self, parser): self.parser = parser @@ -816,14 +801,14 @@ cdef class GZIteratorParsed(GZIterator): self.buffer.l) -def tabix_compress(filename_in, +def tabix_compress(filename_in, filename_out, force=False): - '''compress *filename_in* writing the output to *filename_out*. - + """compress *filename_in* writing the output to *filename_out*. + Raise an IOError if *filename_out* already exists, unless *force* is set. 
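The usual workflow pairs this with tabix_index below; a hedged sketch (filenames hypothetical):

import pysam

pysam.tabix_compress("example.bed", "example.bed.gz", force=True)
pysam.tabix_index("example.bed.gz", preset="bed", force=True)
# leaves example.bed.gz plus an example.bed.gz.tbi index alongside it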
- ''' + """ if not force and os.path.exists(filename_out): raise IOError( @@ -855,7 +840,7 @@ def tabix_compress(filename_in, buffer = malloc(WINDOW_SIZE) c = 1 - + while c > 0: with nogil: c = read(fd_src, buffer, WINDOW_SIZE) @@ -865,7 +850,7 @@ def tabix_compress(filename_in, if r < 0: free(buffer) raise IOError("writing failed") - + free(buffer) r = bgzf_close(fp) if r < 0: @@ -899,7 +884,7 @@ def tabix_index(filename, keep_original=False, csi=False, ): - '''index tab-separated *filename* using tabix. + """index tab-separated *filename* using tabix. An existing index will not be overwritten unless *force* is set. @@ -911,14 +896,14 @@ def tabix_index(filename, Column indices are 0-based. Note that this is different from the tabix command line utility where column indices start at 1. - + Coordinates in the file are assumed to be 1-based unless *zerobased* is set. If *preset* is provided, the column coordinates are taken from a preset. Valid values for preset are "gff", "bed", "sam", "vcf", psltbl", "pileup". - + Lines beginning with *meta_char* and the first *line_skip* lines will be skipped. @@ -927,7 +912,7 @@ def tabix_index(filename, file will be retained. *min-shift* sets the minimal interval size to 1<malloc( buffer_size ) -# self.size = buffer_size -# self.parser = parser - -# def __iter__(self): -# return self - -# cdef __cnext__(self): - -# cdef char * b -# cdef size_t nbytes -# b = self.buffer - -# while not feof( self.infile ): -# nbytes = getline( &b, &self.size, self.infile) - -# # stop at first error or eof -# if (nbytes == -1): break -# # skip comments -# if (b[0] == '#'): continue - -# # skip empty lines -# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue - -# # make sure that entry is complete -# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': -# result = b -# raise ValueError( "incomplete line at %s" % result ) - -# # make sure that this goes fully through C -# # otherwise buffer is copied to/from a -# # Python object causing segfaults as -# # the wrong memory is freed -# return self.parser.parse( b, nbytes ) - -# raise StopIteration - -# def __dealloc__(self): -# free(self.buffer) + return filename -# def __next__(self): -# return self.__cnext__() ######################################################### ######################################################### @@ -1111,11 +1032,11 @@ def tabix_index(filename, cdef class tabix_file_iterator: - '''iterate over a compressed or uncompressed ``infile``. - ''' + """iterate over a compressed or uncompressed ``infile``. + """ - def __cinit__(self, - infile, + def __cinit__(self, + infile, Parser parser, int buffer_size=65536): @@ -1131,20 +1052,20 @@ cdef class tabix_file_iterator: self.duplicated_fd = dup(fd) # From the manual: - # gzopen can be used to read a file which is not in gzip format; - # in this case gzread will directly read from the file without decompression. - # When reading, this will be detected automatically by looking - # for the magic two-byte gzip header. + # gzopen can be used to read a file which is not in gzip format; + # in this case gzread will directly read from the file without decompression. + # When reading, this will be detected automatically by looking + # for the magic two-byte gzip header. 
self.fh = bgzf_dopen(self.duplicated_fd, 'r') - if self.fh == NULL: + if self.fh == NULL: raise IOError('%s' % strerror(errno)) - self.kstream = ks_init(self.fh) - + self.kstream = ks_init(self.fh) + self.buffer.s = malloc(buffer_size) - #if self.buffer == NULL: - # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) + if self.buffer.s == NULL: + raise MemoryError("tabix_file_iterator: could not allocate %i bytes" % buffer_size) #self.size = buffer_size self.parser = parser @@ -1158,20 +1079,20 @@ cdef class tabix_file_iterator: cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) - - if retval < 0: + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) + + if retval < 0: break - #raise IOError('gzip error: %s' % buildGzipError( self.fh )) + #raise IOError('gzip error: %s' % buildGzipError(self.fh)) b = self.buffer.s - + # skip comments - if (b[0] == '#'): + if b[0] == b'#': continue # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': continue # gzgets terminates at \n, no need to test @@ -1185,19 +1106,16 @@ cdef class tabix_file_iterator: free(self.buffer.s) ks_destroy(self.kstream) bgzf_close(self.fh) - + def __next__(self): return self.__cnext__() - def next(self): - return self.__cnext__() - class tabix_generic_iterator: - '''iterate over ``infile``. - + """iterate over ``infile``. + Permits the use of file-like objects for example from the gzip module. - ''' + """ def __init__(self, infile, parser): self.infile = infile @@ -1210,7 +1128,7 @@ class tabix_generic_iterator: # cython version - required for python 3 def __next__(self): - + cdef char * b cdef char * cpy cdef size_t nbytes @@ -1227,39 +1145,35 @@ class tabix_generic_iterator: line = self.infile.readline() if not line: break - + s = force_bytes(line, encoding) b = s nbytes = len(line) - assert b[nbytes] == '\0' + assert b[nbytes] == b'\0' # skip comments - if b[0] == '#': + if b[0] == b'#': continue # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': continue - + # make sure that entry is complete - if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r': raise ValueError("incomplete line at %s" % line) - + bytes_cpy = b cpy = bytes_cpy - return self.parser(cpy, nbytes) + return self.parser(cpy, nbytes) raise StopIteration - # python version - required for python 2.7 - def next(self): - return self.__next__() - def tabix_iterator(infile, parser): """return an iterator over all entries in a file. - + Results are returned parsed as specified by the *parser*. If *parser* is None, the results are returned as an unparsed string. 
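tabix_generic_iterator exists precisely so that plain file-like objects work here; a hedged sketch using the gzip module (filename hypothetical):

import gzip
import pysam

with gzip.open("example.gtf.gz", "rt") as fh:
    for rec in pysam.tabix_iterator(fh, pysam.asGTF()):
        print(rec.feature, rec.start, rec.end)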
Otherwise, *parser* is assumed to be a functor that will return @@ -1268,30 +1182,12 @@ def tabix_iterator(infile, parser): """ return tabix_generic_iterator(infile, parser) - - # file objects can use C stdio - # used to be: isinstance( infile, file): - # if PY_MAJOR_VERSION >= 3: - # if isinstance( infile, io.IOBase ): - # return tabix_copy_iterator( infile, parser ) - # else: - # return tabix_generic_iterator( infile, parser ) - # else: -# if isinstance( infile, file ): -# return tabix_copy_iterator( infile, parser ) -# else: -# return tabix_generic_iterator( infile, parser ) - -cdef class Tabixfile(TabixFile): - """Tabixfile is deprecated: use TabixFile instead""" - pass __all__ = [ - "tabix_index", + "tabix_index", "tabix_compress", "TabixFile", - "Tabixfile", "asTuple", "asGTF", "asGFF3", @@ -1299,7 +1195,7 @@ __all__ = [ "asBed", "GZIterator", "GZIteratorHead", - "tabix_iterator", - "tabix_generic_iterator", - "tabix_file_iterator", + "tabix_iterator", + "tabix_generic_iterator", + "tabix_file_iterator", ] diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd index edea701..9c93a6a 100644 --- a/pysam/libctabixproxies.pxd +++ b/pysam/libctabixproxies.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + #cdef extern from "Python.h": # ctypedef struct FILE diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx index f95425a..65c35b7 100644 --- a/pysam/libctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + from cpython cimport PyBytes_FromStringAndSize from libc.stdio cimport printf, feof, fgets @@ -13,33 +15,31 @@ import copy cdef char *StrOrEmpty(char * buffer): - if buffer == NULL: - return "" - else: return buffer + if buffer == NULL: + return "" + else: + return buffer cdef int isNew(char * p, char * buffer, size_t nbytes): - """return True if `p` is located within `buffer` of size - `nbytes` - """ + """return True if `p` is located within `buffer` of size `nbytes`.""" if p == NULL: return 0 - + return not (buffer <= p <= buffer + nbytes) cdef class TupleProxy: - '''Proxy class for access to parsed row as a tuple. + """Proxy class for access to parsed row as a tuple. This class represents a table row for fast read-access. Access to individual fields is via the [] operator. - - Only read-only access is implemented. - ''' + Only read-only access is implemented. - def __cinit__(self, encoding="ascii"): + """ + def __cinit__(self, encoding="ascii"): self.data = NULL self.fields = NULL self.index = 0 @@ -72,10 +72,7 @@ cdef class TupleProxy: return n def compare(self, TupleProxy other): - '''return -1,0,1, if contents in this are binary - <,=,> to *other* - - ''' + """return -1,0,1, if contents in this are binary <,=,> to *other*.""" if self.is_modified or other.is_modified: raise NotImplementedError( 'comparison of modified TupleProxies is not implemented') @@ -98,28 +95,31 @@ cdef class TupleProxy: raise NotImplementedError(err_msg) cdef take(self, char * buffer, size_t nbytes): - '''start presenting buffer. + """start presenting buffer. Take ownership of the pointer. - ''' + + """ self.data = buffer self.nbytes = nbytes self.update(buffer, nbytes) cdef present(self, char * buffer, size_t nbytes): - '''start presenting buffer. + """start presenting buffer. Do not take ownership of the pointer. - ''' + + """ self.update(buffer, nbytes) cdef copy(self, char * buffer, size_t nbytes, bint reset=False): - '''start presenting buffer of size *nbytes*. 
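The take/present/copy trio above differs only in who owns the underlying char* buffer and therefore who frees it. A plain-Python analogy of the contract (the class is illustrative, not pysam API):

class BufferHolder:
    def take(self, buf):        # assume ownership; we free it in teardown
        self.data, self.owns_data = buf, True

    def present(self, buf):     # borrow; the caller keeps ownership
        self.data, self.owns_data = buf, False

    def copy(self, buf):        # private duplicate; we own the copy
        self.data, self.owns_data = bytes(buf), True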
+ """start presenting buffer of size *nbytes*. Buffer is a '\0'-terminated string without the '\n'. Take a copy of buffer. - ''' + + """ # +1 for '\0' cdef int s = sizeof(char) * (nbytes + 1) self.data = malloc(s) @@ -129,24 +129,23 @@ cdef class TupleProxy: if reset: for x from 0 <= x < nbytes: - if self.data[x] == '\0': - self.data[x] = '\t' + if self.data[x] == b'\0': + self.data[x] = b'\t' self.update(self.data, nbytes) cpdef int getMinFields(self): - '''return minimum number of fields.''' + """return minimum number of fields.""" # 1 is not a valid tabix entry, but TupleProxy # could be more generic. return 1 cpdef int getMaxFields(self): - '''return maximum number of fields. Return - 0 for unknown length.''' + """maximum number of fields or 0 for unknown length.""" return 0 cdef update(self, char * buffer, size_t nbytes): - '''update internal data. + """update internal data. *buffer* is a \0 terminated string. @@ -160,7 +159,7 @@ cdef class TupleProxy: If max_fields is set, the number of fields is initialized to max_fields. - ''' + """ cdef char * pos cdef char * old_pos cdef int field @@ -176,8 +175,8 @@ cdef class TupleProxy: ################################# # remove line breaks and feeds and update number of bytes x = nbytes - 1 - while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'): - buffer[x] = '\0' + while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'): + buffer[x] = b'\0' x -= 1 self.nbytes = x + 1 @@ -185,11 +184,11 @@ cdef class TupleProxy: # clear data if self.fields != NULL: free(self.fields) - + for field from 0 <= field < self.nfields: if isNew(self.fields[field], self.data, self.nbytes): free(self.fields[field]) - + self.is_modified = self.nfields = 0 ################################# @@ -199,11 +198,11 @@ cdef class TupleProxy: # to guess or dynamically grow if max_fields == 0: for x from 0 <= x < nbytes: - if buffer[x] == '\t': + if buffer[x] == b'\t': max_fields += 1 max_fields += 1 - self.fields = calloc(max_fields, sizeof(char *)) + self.fields = calloc(max_fields, sizeof(char *)) if self.fields == NULL: raise ValueError("out of memory in TupleProxy.update()") @@ -214,8 +213,8 @@ cdef class TupleProxy: field += 1 old_pos = pos while 1: - - pos = memchr(pos, '\t', nbytes) + + pos = memchr(pos, b'\t', nbytes) if pos == NULL: break if field >= max_fields: @@ -223,7 +222,7 @@ cdef class TupleProxy: "parsing error: more than %i fields in line: %s" % (max_fields, buffer)) - pos[0] = '\0' + pos[0] = b'\0' pos += 1 self.fields[field] = pos field += 1 @@ -238,13 +237,13 @@ cdef class TupleProxy: (self.getMinFields(), buffer)) def _getindex(self, int index): - '''return item at idx index''' + """item at index.""" cdef int i = index if i < 0: i += self.nfields if i < 0: raise IndexError("list index out of range") - # apply offset - separating a fixed number + # apply offset - separating a fixed number # of fields from a variable number such as in VCF i += self.offset if i >= self.nfields: @@ -264,7 +263,7 @@ cdef class TupleProxy: return result def _setindex(self, index, value): - '''set item at idx index.''' + """set item at idx index.""" cdef int idx = index if idx < 0: raise IndexError("list index out of range") @@ -289,12 +288,12 @@ cdef class TupleProxy: strcpy(self.fields[idx], tmp) def __setitem__(self, index, value): - '''set item at *index* to *value*''' + """set item at *index* to *value*.""" cdef int i = index if i < 0: i += self.nfields i += self.offset - + self._setindex(i, value) def __len__(self): @@ -304,9 +303,8 @@ cdef class TupleProxy: self.index 
= 0 return self - def __next__(self): - """python version of next(). - """ + def __next__(self): + """python version of next().""" if self.index >= self.nfields: raise StopIteration cdef char * retval = self.fields[self.index] @@ -317,7 +315,6 @@ cdef class TupleProxy: return force_str(retval, self.encoding) def __str__(self): - '''return original data''' # copy and replace \0 bytes with \t characters cdef char * cpy if self.is_modified: @@ -332,25 +329,25 @@ cdef class TupleProxy: raise ValueError("out of memory") memcpy(cpy, self.data, self.nbytes+1) for x from 0 <= x < self.nbytes: - if cpy[x] == '\0': - cpy[x] = '\t' + if cpy[x] == b'\0': + cpy[x] = b'\t' result = cpy[:self.nbytes] free(cpy) r = result.decode(self.encoding) return r def toDot(v): - '''convert value to '.' if None''' + """convert value to '.' if None""" if v is None: - return "." + return "." else: return str(v) def quote(v): - '''return a quoted attribute.''' + """return a quoted attribute.""" if isinstance(v, str): return '"%s"' % v - else: + else: return str(v) @@ -359,7 +356,7 @@ cdef class NamedTupleProxy(TupleProxy): map_key2field = {} def __setattr__(self, key, value): - '''set attribute.''' + """set attribute.""" cdef int idx idx, f = self.map_key2field[key] if self.nfields < idx: @@ -410,7 +407,7 @@ cdef str to1based(int v): cdef class GTFProxy(NamedTupleProxy): - '''Proxy class for access to GTF fields. + """Proxy class for access to GTF fields. This class represents a GTF entry for fast read-access. Write-access has been added as well, though some care must @@ -421,7 +418,7 @@ cdef class GTFProxy(NamedTupleProxy): The only exception is the attributes field when set from a dictionary - this field will manage its own memory. - ''' + """ separator = "; " # first value is field index, the tuple contains conversion @@ -429,32 +426,33 @@ cdef class GTFProxy(NamedTupleProxy): # to pythonic value) and setting (converting pythonic value to # interval string representation) map_key2field = { - 'contig' : (0, (str, str)), - 'source' : (1, (dot_or_str, str)), + 'contig': (0, (str, str)), + 'source': (1, (dot_or_str, str)), 'feature': (2, (dot_or_str, str)), - 'start' : (3, (from1based, to1based)), - 'end' : (4, (int, int)), - 'score' : (5, (dot_or_float, toDot)), - 'strand' : (6, (dot_or_str, str)), - 'frame' : (7, (dot_or_int, toDot)), + 'start': (3, (from1based, to1based)), + 'end': (4, (int, int)), + 'score': (5, (dot_or_float, toDot)), + 'strand': (6, (dot_or_str, str)), + 'frame': (7, (dot_or_int, toDot)), 'attributes': (8, (str, str))} - - def __cinit__(self): + + def __cinit__(self): # automatically calls TupleProxy.__cinit__ self.attribute_dict = None - + cpdef int getMinFields(self): - '''return minimum number of fields.''' + """return minimum number of fields.""" return 9 cpdef int getMaxFields(self): - '''return max number of fields.''' + """return max number of fields.""" return 9 def to_dict(self): """parse attributes - return as dict The dictionary can be modified to update attributes. 
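For example (hedged; the file, region and edited attribute are hypothetical):

import pysam

tbx = pysam.TabixFile("example.gtf.gz")
for gtf in tbx.fetch("chr1", 0, 10000, parser=pysam.asGTF()):
    attrs = gtf.to_dict()
    attrs["gene_id"] = "renamed_gene"   # per the docstring, edits update the record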
+ """ if not self.attribute_dict: self.attribute_dict = self.attribute_string2dict( @@ -463,12 +461,11 @@ cdef class GTFProxy(NamedTupleProxy): return self.attribute_dict def as_dict(self): - """deprecated: use :meth:`to_dict` - """ + """deprecated: use :meth:`to_dict`.""" return self.to_dict() def from_dict(self, d): - '''set attributes from a dictionary.''' + """set attributes from a dictionary.""" self.attribute_dict = None attribute_string = force_bytes( self.attribute_dict2string(d), @@ -480,43 +477,42 @@ cdef class GTFProxy(NamedTupleProxy): cdef int x if self.is_modified: - return "\t".join( - (self.contig, - toDot(self.source), - toDot(self.feature), + return "\t".join( + (self.contig, + toDot(self.source), + toDot(self.feature), str(self.start + 1), str(self.end), toDot(self.score), toDot(self.strand), toDot(self.frame), self.attributes)) - else: + else: return TupleProxy.__str__(self) def invert(self, int lcontig): - '''invert coordinates to negative strand coordinates - + """invert coordinates to negative strand coordinates. + This method will only act if the feature is on the - negative strand.''' + negative strand. + """ if self.strand[0] == '-': start = min(self.start, self.end) end = max(self.start, self.end) self.start, self.end = lcontig - end, lcontig - start def keys(self): - '''return a list of attributes defined in this entry.''' + """return a list of attributes defined in this entry.""" if not self.attribute_dict: - self.attribute_dict = self.attribute_string2dict( - self.attributes) + self.attribute_dict = self.attribute_string2dict(self.attributes) return self.attribute_dict.keys() def __getitem__(self, key): return self.__getattr__(key) def setAttribute(self, name, value): - '''convenience method to set an attribute. - ''' + """convenience method to set an attribute.""" if not self.attribute_dict: self.attribute_dict = self.attribute_string2dict( self.attributes) @@ -526,7 +522,7 @@ cdef class GTFProxy(NamedTupleProxy): def attribute_string2dict(self, s): return collections.OrderedDict( self.attribute_string2iterator(s)) - + def __cmp__(self, other): return (self.contig, self.strand, self.start) < \ (other.contig, other.strand, other.start) @@ -548,9 +544,7 @@ cdef class GTFProxy(NamedTupleProxy): raise NotImplementedError(err_msg) def dict2attribute_string(self, d): - """convert dictionary to attribute string in GTF format. - - """ + """convert dictionary to attribute string in GTF format.""" aa = [] for k, v in d.items(): if isinstance(v, str): @@ -561,10 +555,7 @@ cdef class GTFProxy(NamedTupleProxy): return self.separator.join(aa) + ";" def attribute_string2iterator(self, s): - """convert attribute string in GTF format to records - and iterate over key, value pairs. 
- """ - + """convert attribute string in GTF format to records and iterate over key, value pairs.""" # remove comments attributes = force_str(s, encoding=self.encoding) @@ -602,13 +593,11 @@ cdef class GTFProxy(NamedTupleProxy): pass except TypeError: pass - + yield n, v - - def __getattr__(self, key): - """Generic lookup of attribute from GFF/GTF attributes - """ + def __getattr__(self, key): + """Generic lookup of attribute from GFF/GTF attributes.""" # Only called if there *isn't* an attribute with this name cdef int idx idx, f = self.map_key2field.get(key, (-1, None)) @@ -621,7 +610,7 @@ cdef class GTFProxy(NamedTupleProxy): TupleProxy._setindex(self, idx, s) self.attribute_dict = None return s - + if f[0] == str: return force_str(self.fields[idx], self.encoding) @@ -635,8 +624,7 @@ cdef class GTFProxy(NamedTupleProxy): return self.attribute_dict[key] def __setattr__(self, key, value): - '''set attribute.''' - + """set attribute.""" # Note that __setattr__ is called before properties, so __setattr__ and # properties don't mix well. This is different from __getattr__ which is # called after any properties have been resolved. @@ -659,32 +647,21 @@ cdef class GTFProxy(NamedTupleProxy): self.attribute_dict[key] = value self.is_modified = True - # for backwards compatibility - def asDict(self, *args, **kwargs): - return self.to_dict(*args, **kwargs) - - def fromDict(self, *args, **kwargs): - return self.from_dict(*args, **kwargs) - cdef class GFF3Proxy(GTFProxy): - def dict2attribute_string(self, d): """convert dictionary to attribute string.""" - return ";".join(["{}={}".format(k, v) for k, v in d.items()]) - + return ";".join("{}={}".format(k, v) for k, v in d.items()) + def attribute_string2iterator(self, s): - """convert attribute string in GFF3 format to records - and iterate over key, value pairs. - """ - + """convert attribute string in GFF3 format to records and iterate over key, value pairs.""" for f in (x.strip() for x in s.split(";")): if not f: continue key, value = f.split("=", 1) value = value.strip() - + ## try to convert to a value try: value = float(value) @@ -693,42 +670,45 @@ cdef class GFF3Proxy(GTFProxy): pass except TypeError: pass - + yield key.strip(), value - + cdef class BedProxy(NamedTupleProxy): - '''Proxy class for access to Bed fields. + """Proxy class for access to Bed fields. This class represents a BED entry for fast read-access. - ''' + + """ map_key2field = { - 'contig' : (0, str), - 'start' : (1, int), - 'end' : (2, int), - 'name' : (3, str), - 'score' : (4, float), - 'strand' : (5, str), - 'thickStart' : (6, int), - 'thickEnd' : (7, int), - 'itemRGB' : (8, str), + 'contig': (0, str), + 'start': (1, int), + 'end': (2, int), + 'name': (3, str), + 'score': (4, float), + 'strand': (5, str), + 'thickStart': (6, int), + 'thickEnd': (7, int), + 'itemRGB': (8, str), 'blockCount': (9, int), 'blockSizes': (10, str), - 'blockStarts': (11, str), } + 'blockStarts': (11, str), + } cpdef int getMinFields(self): - '''return minimum number of fields.''' + """minimum number of fields.""" return 3 cpdef int getMaxFields(self): - '''return max number of fields.''' + """max number of fields.""" return 12 cdef update(self, char * buffer, size_t nbytes): - '''update internal data. + """update internal data. nbytes does not include the terminal '\0'. 
- ''' + + """ TupleProxy.update(self, buffer, nbytes) if self.nfields < 3: @@ -740,7 +720,7 @@ cdef class BedProxy(NamedTupleProxy): # do automatic conversion self.contig = self.fields[0] - self.start = atoi(self.fields[1]) + self.start = atoi(self.fields[1]) self.end = atoi(self.fields[2]) # __setattr__ in base class seems to take precedence @@ -751,7 +731,6 @@ cdef class BedProxy(NamedTupleProxy): # def __get__( self ): return self.end def __str__(self): - cdef int save_fields = self.nfields # ensure fields to use correct format self.nfields = self.bedfields @@ -760,7 +739,7 @@ cdef class BedProxy(NamedTupleProxy): return retval def __setattr__(self, key, value): - '''set attribute.''' + """set attribute.""" if key == "start": self.start = value elif key == "end": @@ -769,53 +748,56 @@ cdef class BedProxy(NamedTupleProxy): cdef int idx idx, f = self.map_key2field[key] TupleProxy._setindex(self, idx, str(value)) - + cdef class VCFProxy(NamedTupleProxy): - '''Proxy class for access to VCF fields. + """Proxy class for access to VCF fields. The genotypes are accessed via a numeric index. Sample headers are not available. - ''' - map_key2field = { - 'contig' : (0, str), - 'pos' : (1, int), - 'id' : (2, str), - 'ref' : (3, str), - 'alt' : (4, str), - 'qual' : (5, str), - 'filter' : (6, str), - 'info' : (7, str), - 'format' : (8, str) } - - def __cinit__(self): + + """ + map_key2field = { + 'contig': (0, str), + 'pos': (1, int), + 'id': (2, str), + 'ref': (3, str), + 'alt': (4, str), + 'qual': (5, str), + 'filter': (6, str), + 'info': (7, str), + 'format': (8, str) + } + + def __cinit__(self): # automatically calls TupleProxy.__cinit__ # start indexed access at genotypes self.offset = 9 cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - + """update internal data. + nbytes does not include the terminal '\0'. 
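Since the body below shifts POS to 0-based (the "- 1" right after this docstring), add 1 back when printing VCF-style coordinates. Hedged sketch (hypothetical file and region):

import pysam

tbx = pysam.TabixFile("example.vcf.gz", parser=pysam.asVCF())
for rec in tbx.fetch("20", 0, 100000):
    print(rec.contig, rec.pos + 1, rec.ref, rec.alt)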
- ''' + + """ TupleProxy.update(self, buffer, nbytes) self.contig = self.fields[0] # vcf counts from 1 - correct here self.pos = atoi(self.fields[1]) - 1 - + def __len__(self): - '''return number of genotype fields.''' + """return number of genotype fields.""" return max(0, self.nfields - 9) - property pos: - '''feature end (in 0-based open/closed coordinates).''' - def __get__(self): - return self.pos + @property + def pos(self): + """feature end (in 0-based open/closed coordinates).""" + return self.pos def __setattr__(self, key, value): - '''set attribute.''' - if key == "pos": + """set attribute.""" + if key == "pos": self.pos = value value += 1 diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index 9e1cce1..f480bc1 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + ######################################################################### # Utility functions used across pysam ######################################################################### diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index b1fd5df..4a343aa 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -1,3 +1,5 @@ +# cython: language_level=3, embedsignature=True, profile=True + import types import sys import string diff --git a/pysam/libcvcf.pxd b/pysam/libcvcf.pxd deleted file mode 100644 index 139597f..0000000 --- a/pysam/libcvcf.pxd +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/pysam/libcvcf.pyx b/pysam/libcvcf.pyx deleted file mode 100644 index 956f8a5..0000000 --- a/pysam/libcvcf.pyx +++ /dev/null @@ -1,1203 +0,0 @@ -# cython: embedsignature=True -# -# Code to read, write and edit VCF files -# -# VCF lines are encoded as a dictionary with these keys (note: all lowercase): -# 'chrom': string -# 'pos': integer -# 'id': string -# 'ref': string -# 'alt': list of strings -# 'qual': integer -# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"] -# 'info': dictionary of values (see below) -# 'format': list of keys (strings) -# sample keys: dictionary of values (see below) -# -# The sample keys are accessible through vcf.getsamples() -# -# A dictionary of values contains value keys (defined in ##INFO or -# ##FORMAT lines) which map to a list, containing integers, floats, -# strings, or characters. Missing values are replaced by a particular -# value, often -1 or . -# -# Genotypes are not stored as a string, but as a list of 1 or 3 -# elements (for haploid and diploid samples), the first (and last) the -# integer representing an allele, and the second the separation -# character. Note that there is just one genotype per sample, but for -# consistency the single element is stored in a list. -# -# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as -# (key, value) pairs and are accessible through getheader() -# -# The VCF class can be instantiated with a 'regions' variable -# consisting of tuples (chrom,start,end) encoding 0-based half-open -# segments. Only variants with a position inside the segment will be -# parsed. A regions parser is available under parse_regions. -# -# When instantiated, a reference can be passed to the VCF class. This -# may be any class that supports a fetch(chrom, start, end) method. -# -# NOTE: the position that is returned to Python is 0-based, NOT -# 1-based as in the VCF file. -# NOTE: There is also preliminary VCF functionality in the VariantFile class. 
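For code that depended on this module, the maintained replacement is the htslib-backed VariantFile mentioned in the note above; a hedged migration sketch (filename and region hypothetical):

import pysam

with pysam.VariantFile("example.vcf.gz") as vf:
    for rec in vf.fetch("20", 100000, 200000):
        print(rec.chrom, rec.pos, rec.ref, rec.alts)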
-# -# TODO: -# only v4.0 writing is complete; alleles are not converted to v3.3 format -# - -from collections import namedtuple, defaultdict -from operator import itemgetter -import sys, re, copy, bisect - -from libc.stdlib cimport atoi -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t - -cimport pysam.libctabix as libctabix -cimport pysam.libctabixproxies as libctabixproxies - -from pysam.libcutils cimport force_str - -import pysam - -gtsRegEx = re.compile("[|/\\\\]") -alleleRegEx = re.compile('^[ACGTN]+$') - -# Utility function. Uses 0-based coordinates -def get_sequence(chrom, start, end, fa): - # obtain sequence from .fa file, without truncation - if end<=start: return "" - if not fa: return "N"*(end-start) - if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper() - sequence = fa.fetch(chrom, start, end).upper() - if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence)) - return sequence - -# Utility function. Parses a region string -def parse_regions( string ): - result = [] - for r in string.split(','): - elts = r.split(':') - chrom, start, end = elts[0], 0, 3000000000 - if len(elts)==1: pass - elif len(elts)==2: - if len(elts[1])>0: - ielts = elts[1].split('-') - if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r) - try: start, end = int(ielts[0])-1, int(ielts[1]) - except: raise ValueError("Don't understand region string '%s'" % r) - else: - raise ValueError("Don't understand region string '%s'" % r) - result.append( (chrom,start,end) ) - return result - - -FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue') - -########################################################################################################### -# -# New class -# -########################################################################################################### - -cdef class VCFRecord(libctabixproxies.TupleProxy): - '''vcf record. - - initialized from data and vcf meta - ''' - - cdef vcf - cdef char * contig - cdef uint32_t pos - - def __init__(self, vcf): - self.vcf = vcf - self.encoding = vcf.encoding - - # if len(data) != len(self.vcf._samples): - # self.vcf.error(str(data), - # self.BAD_NUMBER_OF_COLUMNS, - # "expected %s for %s samples (%s), got %s" % \ - # (len(self.vcf._samples), - # len(self.vcf._samples), - # self.vcf._samples, - # len(data))) - - def __cinit__(self, vcf): - # start indexed access at genotypes - self.offset = 9 - - self.vcf = vcf - self.encoding = vcf.encoding - - def error(self, line, error, opt=None): - '''raise error.''' - # pass to vcf file for error handling - return self.vcf.error(line, error, opt) - - cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - - nbytes does not include the terminal '\0'. 
- ''' - libctabixproxies.TupleProxy.update(self, buffer, nbytes) - - self.contig = self.fields[0] - # vcf counts from 1 - correct here - self.pos = atoi(self.fields[1]) - 1 - - def __len__(self): - return max(0, self.nfields - 9) - - property contig: - def __get__(self): return self.contig - - property pos: - def __get__(self): return self.pos - - property id: - def __get__(self): return self.fields[2] - - property ref: - def __get__(self): - return self.fields[3] - - property alt: - def __get__(self): - # convert v3.3 to v4.0 alleles below - alt = self.fields[4] - if alt == ".": alt = [] - else: alt = alt.upper().split(',') - return alt - - property qual: - def __get__(self): - qual = self.fields[5] - if qual == b".": qual = -1 - else: - try: qual = float(qual) - except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL) - return qual - - property filter: - def __get__(self): - f = self.fields[6] - # postpone checking that filters exist. Encode missing filter or no filtering as empty list - if f == b"." or f == b"PASS" or f == b"0": return [] - else: return f.split(';') - - property info: - def __get__(self): - col = self.fields[7] - # dictionary of keys, and list of values - info = {} - if col != b".": - for blurp in col.split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.vcf.error(str(self),self.ERROR_INFO_STRING) - info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf)) - return info - - property format: - def __get__(self): - return self.fields[8].split(':') - - property samples: - def __get__(self): - return self.vcf._samples - - def __getitem__(self, key): - - # parse sample columns - values = self.fields[self.vcf._sample2column[key]].split(':') - alt = self.alt - format = self.format - - if len(values) > len(format): - self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\ - (len(values),key,len(format))) - - result = {} - for idx in range(len(format)): - expected = self.vcf.get_expected(format[idx], self.vcf._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." 
- else: value = ",".join(["."]*expected) - - result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data)) - if expected != -1 and len(result[format[idx]]) != expected: - self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS, - "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]])) - if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]])) - result[format[idx]] = result[format[idx]][:expected] - - return result - - -cdef class asVCFRecord(libctabix.Parser): - '''converts a :term:`tabix row` into a VCF record.''' - cdef vcffile - def __init__(self, vcffile): - self.vcffile = vcffile - - cdef parse(self, char * buffer, int len): - cdef VCFRecord r - r = VCFRecord(self.vcffile) - r.copy(buffer, len) - return r - -class VCF(object): - - # types - NT_UNKNOWN = 0 - NT_NUMBER = 1 - NT_ALLELES = 2 - NT_NR_ALLELES = 3 - NT_GENOTYPES = 4 - NT_PHASED_GENOTYPES = 5 - - _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier", - 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string", - 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s", - 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)", - 4:"POS_NOT_NUMERICAL:Position column is not numerical", - 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field", - 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF", - 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF", - 8:"POS_NOT_POSITIVE:Position field must be >0", - 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'", - 10:"ERROR_INFO_STRING:Error while parsing info field", - 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)", - 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s", - 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string", - 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header", - 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header", - 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)", - 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)", - 18:"BAD_GENOTYPE:Cannot parse genotype (%s)", - 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)", - 20:"MISSING_REF:Reference allele missing", - 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)", - 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets", - 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes", - 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields", - 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs", - 26:"WRONG_REF:Wrong reference %s", - 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data", - 28:"BAD_CHR_TAG:Error calculating chr tag for %s", - 29:"ZERO_LENGTH_ALLELE:Found zero-length allele", - 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base", - 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'", - 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s", - 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value", - } - - # tag-value pairs; tags are not unique; does not include 
fileformat, INFO, FILTER or FORMAT fields - _header = [] - - # version number; 33=v3.3; 40=v4.0 - _version = 40 - - # info, filter and format data - _info = {} - _filter = {} - _format = {} - - # header; and required columns - _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] - _samples = [] - - # control behaviour - _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD - _warn_errors = set([]) - _leftalign = False - - # reference sequence - _reference = None - - # regions to include; None includes everything - _regions = None - - # statefull stuff - _lineno = -1 - _line = None - _lines = None - - def __init__(self, _copy=None, reference=None, regions=None, - lines=None, leftalign=False): - # make error identifiers accessible by name - for id in self._errors.keys(): - self.__dict__[self._errors[id].split(':')[0]] = id - if _copy != None: - self._leftalign = _copy._leftalign - self._header = _copy._header[:] - self._version = _copy._version - self._info = copy.deepcopy(_copy._info) - self._filter = copy.deepcopy(_copy._filter) - self._format = copy.deepcopy(_copy._format) - self._samples = _copy._samples[:] - self._sample2column = copy.deepcopy(_copy._sample2column) - self._ignored_errors = copy.deepcopy(_copy._ignored_errors) - self._warn_errors = copy.deepcopy(_copy._warn_errors) - self._reference = _copy._reference - self._regions = _copy._regions - if reference: self._reference = reference - if regions: self._regions = regions - if leftalign: self._leftalign = leftalign - self._lines = lines - self.encoding = "ascii" - self.tabixfile = None - - def error(self,line,error,opt=None): - if error in self._ignored_errors: return - errorlabel, errorstring = self._errors[error].split(':') - if opt: errorstring = errorstring % opt - errwarn = ["Error","Warning"][error in self._warn_errors] - errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring) - if error in self._warn_errors: return - raise ValueError(errorstring) - - def parse_format(self,line,format,filter=False): - if self._version == 40: - if not format.startswith('<'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format = "<"+format - if not format.endswith('>'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format += ">" - format = format[1:-1] - data = {'id':None,'number':None,'type':None,'descr':None} - idx = 0 - while len(format.strip())>0: - elts = format.strip().split(',') - first, rest = elts[0], ','.join(elts[1:]) - if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')): - if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS) - if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - first = ["ID=","Number=","Type=","Description="][idx] + first - if first.startswith('ID='): data['id'] = first.split('=')[1] - elif first.startswith('Number='): data['number'] = first.split('=')[1] - elif first.startswith('Type='): data['type'] = first.split('=')[1] - elif first.startswith('Description='): - elts = format.split('"') - if len(elts)<3: - self.error(line,self.FORMAT_MISSING_QUOTES) - elts = first.split('=') + [rest] - data['descr'] = elts[1] - rest = '"'.join(elts[2:]) - if rest.startswith(','): rest = rest[1:] - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - format = rest - idx += 1 - if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings - if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - 
if 'descr' not in data: - # missing description - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - data['descr'] = "" - if not data['type'] and not data['number']: - # fine, ##filter format - return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.') - if not data['type'] in ["Integer","Float","Character","String","Flag"]: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - # I would like a missing-value field, but it isn't there - if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value - else: data['missing'] = '.' - if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - try: - n = int(data['number']) - t = self.NT_NUMBER - except ValueError: - n = -1 - if data['number'] == '.': t = self.NT_UNKNOWN - elif data['number'] == '#alleles': t = self.NT_ALLELES - elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES - elif data['number'] == '#genotypes': t = self.NT_GENOTYPES - elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES - elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES - # abbreviations added in VCF version v4.1 - elif data['number'] == 'A': t = self.NT_ALLELES - elif data['number'] == 'G': t = self.NT_GENOTYPES - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - # if number is 0 - type must be Flag - if n == 0 and data['type'] != 'Flag': - self.error( line, self.ZERO_FOR_NON_FLAG_FIELD) - # force type 'Flag' if no number - data['type'] = 'Flag' - - return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing']) - - def format_format( self, fmt, filter=False ): - values = [('ID',fmt.id)] - if fmt.number != None and not filter: - if fmt.numbertype == self.NT_UNKNOWN: nmb = "." 
- elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number) - elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles" - elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles" - elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes" - elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes" - else: - raise ValueError("Unknown number type encountered: %s" % fmt.numbertype) - values.append( ('Number',nmb) ) - values.append( ('Type', fmt.type) ) - values.append( ('Description', '"' + fmt.description + '"') ) - if self._version == 33: - format = ",".join([v for k,v in values]) - else: - format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">" - return format - - def get_expected(self, format, formatdict, alt): - fmt = formatdict[format] - if fmt.numbertype == self.NT_UNKNOWN: return -1 - if fmt.numbertype == self.NT_NUMBER: return fmt.number - if fmt.numbertype == self.NT_ALLELES: return len(alt)+1 - if fmt.numbertype == self.NT_NR_ALLELES: return len(alt) - if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2 - if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1) - return 0 - - - def _add_definition(self, formatdict, key, data, line ): - if key in formatdict: return - self.error(line,self.ERROR_UNKNOWN_KEY,key) - if data == None: - formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".") - return - if data == []: data = [""] # unsure what type -- say string - if type(data[0]) == type(0.0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None) - return - if type(data[0]) == type(0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None) - return - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".") - - - # todo: trim trailing missing values - def format_formatdata( self, data, format, key=True, value=True, separator=":" ): - output, sdata = [], [] - if type(data) == type([]): # for FORMAT field, make data with dummy values - d = {} - for k in data: d[k] = [] - data = d - # convert missing values; and silently add definitions if required - for k in data: - self._add_definition( format, k, data[k], "(output)" ) - for idx,v in enumerate(data[k]): - if v == format[k].missingvalue: data[k][idx] = "." - # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string - for k in data: - if k != 'GT': sdata.append( (k,data[k]) ) - sdata.sort() - if 'GT' in data: - sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata - for k,v in sdata: - if v == []: v = None - if key and value: - if v != None: output.append( k+"="+','.join(map(str,v)) ) - else: output.append( k ) - elif key: output.append(k) - elif value: - if v != None: output.append( ','.join(map(str,v)) ) - else: output.append( "." 
) # should not happen - # snip off trailing missing data - while len(output) > 1: - last = output[-1].replace(',','').replace('.','') - if len(last)>0: break - output = output[:-1] - return separator.join(output) - - - def enter_default_format(self): - for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'), - FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1), - FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'), - FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), - FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), - FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1), - FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'), - FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'), - FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'), - FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid - FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'), - FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1), - FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1), - FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1), - ]: - if f.id not in self._format: - self._format[f.id] = f - - def parse_header(self, line): - - assert line.startswith('##') - elts = line[2:].split('=') - key = elts[0].strip() - value = '='.join(elts[1:]).strip() - if key == "fileformat": - if value == "VCFv3.3": - self._version = 33 - elif value == "VCFv4.0": - self._version = 40 - elif value == "VCFv4.1": - # AH - for testing - self._version = 40 - elif value == "VCFv4.2": - # AH - for testing - self._version = 40 - else: - self.error(line,self.UNKNOWN_FORMAT_STRING) - elif key == "INFO": - f = self.parse_format(line, value) - self._info[ f.id ] = f - elif key == "FILTER": - f = self.parse_format(line, value, filter=True) - self._filter[ f.id ] = f - elif key == "FORMAT": - f = self.parse_format(line, value) - self._format[ f.id ] = f - else: - # keep other keys in the header field - self._header.append( (key,value) ) - - - def write_header( self, stream ): - stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10)) - for key,value in self._header: stream.write("##%s=%s\n" % (key,value)) - for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]: - for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER")))) - - - def parse_heading( self, line ): - assert line.startswith('#') - assert not line.startswith('##') - headings = line[1:].split('\t') - # test for 8, as FORMAT field might be missing - if len(headings)==1 and len(line[1:].split()) >= 8: - self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS) - headings = line[1:].split() - - for i,s in enumerate(self._required): - - if len(headings)<=i or headings[i] != s: - - if len(headings) <= i: - err = "(%sth entry not found)" % (i+1) - else: - err = "(found %s, expected %s)" % (headings[i],s) - - #self.error(line,self.BADLY_FORMATTED_HEADING,err) - # allow FORMAT column to be absent - if len(headings) == 8: - headings.append("FORMAT") - else: - self.error(line,self.BADLY_FORMATTED_HEADING,err) - - self._samples = headings[9:] - self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] ) - - def write_heading( self, stream ): - stream.write("#" + 
"\t".join(self._required + self._samples) + "\n") - - def convertGT(self, GTstring): - if GTstring == ".": return ["."] - try: - gts = gtsRegEx.split(GTstring) - if len(gts) == 1: return [int(gts[0])] - if len(gts) != 2: raise ValueError() - if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]] - return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])] - except ValueError: - self.error(self._line,self.BAD_GENOTYPE,GTstring) - return [".","|","."] - - def convertGTback(self, GTdata): - return ''.join(map(str,GTdata)) - - def parse_formatdata( self, key, value, formatdict, line ): - # To do: check that the right number of values is present - f = formatdict.get(key,None) - if f == None: - self._add_definition(formatdict, key, value, line ) - f = formatdict[key] - if f.type == "Flag": - if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE) - return [] - values = value.split(',') - # deal with trailing data in some early VCF files - if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1: - self.error(line,self.ERROR_TRAILING_DATA,values[-1]) - values[-1] = values[-1].split(';')[0] - if f.type == "Integer": - for idx,v in enumerate(values): - try: - if v == ".": values[idx] = f.missingvalue - else: values[idx] = int(v) - except: - self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values))) - return [0] * len(values) - return values - elif f.type == "String": - self._line = line - if f.id == "GT": values = list(map( self.convertGT, values )) - return values - elif f.type == "Character": - for v in values: - if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR) - return values - elif f.type == "Float": - for idx,v in enumerate(values): - if v == ".": values[idx] = f.missingvalue - try: return list(map(float,values)) - except: - self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values))) - return [0.0] * len(values) - else: - # can't happen - self.error(line,self.ERROR_INFO_STRING) - - def inregion(self, chrom, pos): - if not self._regions: return True - for r in self._regions: - if r[0] == chrom and r[1] <= pos < r[2]: return True - return False - - def parse_data( self, line, lineparse=False ): - cols = line.split('\t') - if len(cols) != len(self._samples)+9: - # gracefully deal with absent FORMAT column - # and those missing samples - if len(cols) == 8: - cols.append("") - else: - self.error(line, - self.BAD_NUMBER_OF_COLUMNS, - "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols))) - - chrom = cols[0] - - # get 0-based position - try: pos = int(cols[1])-1 - except: self.error(line,self.POS_NOT_NUMERICAL) - if pos < 0: self.error(line,self.POS_NOT_POSITIVE) - - # implement filtering - if not self.inregion(chrom,pos): return None - - # end of first-pass parse for sortedVCF - if lineparse: return chrom, pos, line - - id = cols[2] - - ref = cols[3].upper() - if ref == ".": - self.error(line,self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,pos-100) - faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(line,self.WRONG_REF,"(reference is 
%s, VCF says %s)" % (faref,ref)) - ref = faref - - # convert v3.3 to v4.0 alleles below - if cols[4] == ".": alt = [] - else: alt = cols[4].upper().split(',') - - if cols[5] == ".": qual = -1 - else: - try: qual = float(cols[5]) - except: self.error(line,self.QUAL_NOT_NUMERICAL) - - # postpone checking that filters exist. Encode missing filter or no filtering as empty list - if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] - else: filter = cols[6].split(';') - - # dictionary of keys, and list of values - info = {} - if cols[7] != ".": - for blurp in cols[7].split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.error(line,self.ERROR_INFO_STRING) - info[elts[0]] = self.parse_formatdata(elts[0], - v, - self._info, - line) - - # Gracefully deal with absent FORMAT column - if cols[8] == "": format = [] - else: format = cols[8].split(':') - - # check: all filters are defined - for f in filter: - if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f) - - # check: format fields are defined - if self._format: - for f in format: - if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(line,self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(line,self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(line,self.V33_BAD_ALLELE) - newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(line,self.V40_BAD_ALLELE,allele) - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - # AH: not certain why trimming this needs to be added - # disabled now for unit testing - # if alt: - # for i in 
range(1,min(len(ref),min(map(len,alt)))): - # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - # break - # ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - - # parse sample columns - samples = [] - for sample in cols[9:]: - dict = {} - values = sample.split(':') - if len(values) > len(format): - self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) - for idx in range(len(format)): - expected = self.get_expected(format[idx], self._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." - else: value = ",".join(["."]*expected) - - dict[format[idx]] = self.parse_formatdata(format[idx], - value, - self._format, - line) - if expected != -1 and len(dict[format[idx]]) != expected: - self.error(line,self.BAD_NUMBER_OF_PARAMETERS, - "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]])) - if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]])) - dict[format[idx]] = dict[format[idx]][:expected] - samples.append( dict ) - - # done - d = {'chrom':chrom, - 'pos':pos, # return 0-based position - 'id':id, - 'ref':ref, - 'alt':alt, - 'qual':qual, - 'filter':filter, - 'info':info, - 'format':format} - for key,value in zip(self._samples,samples): - d[key] = value - - return d - - - def write_data(self, stream, data): - required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples - for k in required: - if k not in data: raise ValueError("Required key %s not found in data" % str(k)) - if data['alt'] == []: alt = "." - else: alt = ",".join(data['alt']) - if data['filter'] == None: filter = "." - elif data['filter'] == []: - if self._version == 33: filter = "0" - else: filter = "PASS" - else: filter = ';'.join(data['filter']) - if data['qual'] == -1: qual = "." 
- else: qual = str(data['qual']) - - output = [data['chrom'], - str(data['pos']+1), # change to 1-based position - data['id'], - data['ref'], - alt, - qual, - filter, - self.format_formatdata( - data['info'], self._info, separator=";"), - self.format_formatdata( - data['format'], self._format, value=False)] - - for s in self._samples: - output.append(self.format_formatdata( - data[s], self._format, key=False)) - - stream.write( "\t".join(output) + "\n" ) - - def _parse_header(self, stream): - self._lineno = 0 - for line in stream: - line = force_str(line, self.encoding) - self._lineno += 1 - if line.startswith('##'): - self.parse_header(line.strip()) - elif line.startswith('#'): - self.parse_heading(line.strip()) - self.enter_default_format() - else: - break - return line - - def _parse(self, line, stream): - # deal with files with header only - if line.startswith("##"): return - if len(line.strip()) > 0: - d = self.parse_data( line.strip() ) - if d: yield d - for line in stream: - self._lineno += 1 - if self._lines and self._lineno > self._lines: raise StopIteration - d = self.parse_data( line.strip() ) - if d: yield d - - ###################################################################################################### - # - # API follows - # - ###################################################################################################### - - def getsamples(self): - """ List of samples in VCF file """ - return self._samples - - def setsamples(self,samples): - """ List of samples in VCF file """ - self._samples = samples - - def getheader(self): - """ List of header key-value pairs (strings) """ - return self._header - - def setheader(self,header): - """ List of header key-value pairs (strings) """ - self._header = header - - def getinfo(self): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - return self._info - - def setinfo(self,info): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - self._info = info - - def getformat(self): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - return self._format - - def setformat(self,format): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - self._format = format - - def getfilter(self): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - return self._filter - - def setfilter(self,filter): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - self._filter = filter - - def setversion(self, version): - if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files") - self._version = version - - def setregions(self, regions): - self._regions = regions - - def setreference(self, ref): - """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """ - self._reference = ref - - def ignoreerror(self, errorstring): - try: self._ignored_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def warnerror(self, errorstring): - try: self._warn_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def parse(self, stream): - """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """ - last_line = self._parse_header(stream) - # now return a generator that does the actual work. 
In this way the pre-processing is done - # before the first piece of data is yielded - return self._parse(last_line, stream) - - def write(self, stream, datagenerator): - """ Writes a VCF file to a stream, using a data generator (or list) """ - self.write_header(stream) - self.write_heading(stream) - for data in datagenerator: self.write_data(stream,data) - - def writeheader(self, stream): - """ Writes a VCF header """ - self.write_header(stream) - self.write_heading(stream) - - def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2): - """ Utility function: compares two calls for equality """ - # a variant should always be assigned to a unique position, one base before - # the leftmost position of the alignment gap. If this rule is implemented - # correctly, the two positions must be equal for the calls to be identical. - if pos1 != pos2: return False - # from both calls, trim rightmost bases when identical. Do this safely, i.e. - # only when the reference bases are not Ns - while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]: - ref1 = ref1[:-1] - alt1 = alt1[:-1] - while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]: - ref2 = ref2[:-1] - alt2 = alt2[:-1] - # now, the alternative alleles must be identical - return alt1 == alt2 - -########################################################################################################### -########################################################################################################### -## API functions added by Andreas -########################################################################################################### - - def connect(self, filename, encoding="ascii"): - '''connect to tabix file.''' - self.encoding=encoding - self.tabixfile = pysam.Tabixfile(filename, encoding=encoding) - self._parse_header(self.tabixfile.header) - - def __del__(self): - self.close() - self.tabixfile = None - - def close(self): - if self.tabixfile: - self.tabixfile.close() - self.tabixfile = None - - def fetch(self, - reference=None, - start=None, - end=None, - region=None ): - """ Parse a stream of VCF-formatted lines. - Initializes class instance and return generator """ - return self.tabixfile.fetch( - reference, - start, - end, - region, - parser = asVCFRecord(self)) - - def validate(self, record): - '''validate vcf record. - - returns a validated record. 
- ''' - - raise NotImplementedError("needs to be checked") - - chrom, pos = record.chrom, record.pos - - # check reference - ref = record.ref - if ref == ".": - self.error(str(record),self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom, - pos, - pos+len(ref), - self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,self.pos-100) - faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) - ref = faref - - # check: format fields are defined - for f in record.format: - if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f) - - # check: all filters are defined - for f in record.filter: - if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(str(record),self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(str(record),self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(str(record),self.V33_BAD_ALLELE) - newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(str(record),self.V40_BAD_ALLELE,allele) - - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - # AH: not certain why trimming this needs to be added - # disabled now for unit testing - # for i in range(1,min(len(ref),min(map(len,alt)))): - # if len(set(allele[-1].upper() for allele 
in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - # break - # ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - -__all__ = [ - "VCF", "VCFRecord", ] diff --git a/pysam/version.py b/pysam/version.py index 845998a..d76acd3 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,3 +1,4 @@ # pysam versioning information -__version__ = "0.1.0" +__version__ = "0.15.4" +__htslib_version__ = "1.9" diff --git a/requirements.txt b/requirements.txt index 6e8fc44..f937d1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -cython>=0.24.1 +cython>=0.29.12 diff --git a/setup.py b/setup.py index 3e34154..41bb320 100644 --- a/setup.py +++ b/setup.py @@ -28,17 +28,17 @@ import subprocess import sys import sysconfig + from contextlib import contextmanager from setuptools import setup from cy_build import CyExtension as Extension, cy_build_ext as build_ext + try: import cython HAVE_CYTHON = True except ImportError: HAVE_CYTHON = False -IS_PYTHON3 = sys.version_info.major >= 3 - @contextmanager def changedir(path): @@ -213,9 +213,6 @@ def get_pysam_version(): dict(name="pysam.libctabixproxies", sources=[source_pattern % "tabixproxies"] + htslib_sources + os_c_files, libraries=libraries_for_pysam_module), - dict(name="pysam.libcvcf", - sources=[source_pattern % "vcf"] + htslib_sources + os_c_files, - libraries=libraries_for_pysam_module), ] common_options = dict( @@ -256,7 +253,7 @@ def get_pysam_version(): 'classifiers': [_f for _f in classifiers.split("\n") if _f], 'url': "https://github.com/pysam-developers/pysam", 'packages': package_list, - 'requires': ['cython (>=0.21)'], + 'requires': ['cython (>=0.29.12)'], 'ext_modules': [Extension(**opts) for opts in modules], 'cmdclass': cmdclass, 'package_dir': package_dirs, diff --git a/tests/00README.txt b/tests/00README.txt index 67b8689..6e60917 100644 --- a/tests/00README.txt +++ b/tests/00README.txt @@ -1,10 +1,10 @@ File ex1.fa contains two sequences cut from the human genome -build36. They were exatracted with command: +build36. They were extracted with command: samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 Sequence names were changed manually for simplicity. File ex1.sam.gz -contains MAQ alignments exatracted with: +contains MAQ alignments extracted with: (samtools view NA18507_maq.bam 2:2044001-2045500; samtools view NA18507_maq.bam 20:68001-69500) @@ -12,15 +12,6 @@ contains MAQ alignments exatracted with: and processed with `samtools fixmate' to make it self-consistent as a standalone alignment. 
-To try samtools, you may run the following commands: - - samtools faidx ex1.fa # index the reference FASTA - samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM - samtools index ex1.bam # index BAM - samtools tview ex1.bam ex1.fa # view alignment - samtools pileup -cf ex1.fa ex1.bam # pileup and consensus - samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz - In order for the script pysam_test.py to work, you will need pysam in your PYTHONPATH. diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 40ebb69..cb7c537 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -73,6 +73,12 @@ def testSettingTagInEmptyRead(self): def testCompare(self): '''check comparison functions.''' a = self.build_read() + b = None + + self.assertFalse(a is b) + self.assertFalse(a == b) + self.assertEqual(-1, a.compare(b)) + b = self.build_read() self.assertEqual(0, a.compare(b)) diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 381b85f..b4339fa 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -176,9 +176,9 @@ def testARqqual(self): def testPresentOptionalFields(self): self.assertEqual( - self.reads[0].get_tag('NM'), 1, + self.reads[0].opt('NM'), 22, "optional field mismatch in read 1, NM: %s != %s" % - (self.reads[0].get_tag('NM'), 1)) + (self.reads[0].opt('NM'), 22)) self.assertEqual( self.reads[0].get_tag('RG'), 'L1', "optional field mismatch in read 1, RG: %s != %s" % @@ -203,8 +203,8 @@ def testPairedBools(self): self.reads[1].is_proper_pair, True)) def testTags(self): - self.assertEqual(self.reads[0].get_tags(), - [('NM', 1), ('RG', 'L1'), + self.assertEqual(self.reads[0].tags, + [('NM', 22), ('RG', 'L1'), ('PG', 'P1'), ('XT', 'U')]) self.assertEqual(self.reads[1].get_tags(), [('MF', 18), ('RG', 'L2'), @@ -1379,21 +1379,6 @@ def testBAMWholeFile(self): os.unlink(tmpfilename) -class TestDeNovoConstructionUserTags(TestDeNovoConstruction): - - '''test de novo construction with a header that contains lower-case tags.''' - - header = {'HD': {'VN': '1.0'}, - 'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], - 'x1': {'A': 2, 'B': 5}, - 'x3': {'A': 6, 'B': 5}, - 'x2': {'A': 4, 'B': 5}} - - bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") - samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") - - class TestEmptyHeader(unittest.TestCase): '''see issue 84.''' @@ -1451,7 +1436,7 @@ def testHeader(self): 'ID': 'bwa', 'VN': '0.7.9a-r786', 'CL': 'bwa mem -p -t 8 -M -R ' - '@RG\tID:None\tSM:None\t/mnt/data/hg19.fa\t' + '@RG ID:None SM:None /mnt/data/hg19.fa ' '/mnt/analysis/default-0.fastq'}]}) diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index bae50db..3921e8a 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -3,6 +3,7 @@ BAM=$(SAM:%.sam=%.bam) BAI=$(BAM:%.bam=%.bam.bai) CRAM=ex1.cram ex2.cram ex3.cram CRAI=$(CRAM:%.cram=%.cram.crai) +NO_PG:=$(findstring --no-PG,$(shell samtools view)) # ex2.bam - bam file without index @@ -26,22 +27,22 @@ all: ex1.pileup.gz \ # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai - samtools view -h ex1.bam | gzip > ex2.sam.gz + samtools view $(NO_PG) -h ex1.bam | gzip > ex2.sam.gz with_md.sam.gz: ex2.bam ex1.fa - samtools calmd --output-fmt BAM $^ > $@ + samtools calmd $(NO_PG) --output-fmt BAM $^ > $@ #%.bam: %.sam ex1.fa.fai -# samtools import ex1.fa.fai $< $@ +# samtools view $(NO_PG) -bo $@ -t ex1.fa.fai $< uncompressed.bam: ex2.sam - samtools view -buS $< > $@ + samtools 
view $(NO_PG) -buS $< > $@ %.bam: %.sam - samtools view -bS $< > $@ + samtools view $(NO_PG) -bS $< > $@ %.cram: %.sam - samtools view -bC -T ex1.fa $< > $@ + samtools view $(NO_PG) -bC -T ex1.fa $< > $@ %.cram.crai: %.cram samtools index $< @@ -53,7 +54,7 @@ ex1.fa.fai:ex1.fa samtools faidx ex1.fa ex1.bam:ex1.sam.gz ex1.fa.fai - samtools import ex1.fa.fai ex1.sam.gz ex1.bam + samtools view $(NO_PG) -bo ex1.bam -t ex1.fa.fai ex1.sam.gz %.bam.bai:%.bam samtools index $< @@ -69,11 +70,11 @@ ex1_csi.bam: ex1.bam samtools index -c ex1_csi.bam empty.bam: ex2.sam - grep "^@" $< | samtools view -Sb - > $@ + grep "^@" $< | samtools view $(NO_PG) -Sb - > $@ example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam touch tmp.list - samtools import tmp.list $< $@ + samtools view $(NO_PG) -bo $@ -t tmp.list $< rm -f tmp.list example_bai.bam: ex1.bam diff --git a/tests/pysam_data/ex3.sam b/tests/pysam_data/ex3.sam index 7a09188..c575c86 100644 --- a/tests/pysam_data/ex3.sam +++ b/tests/pysam_data/ex3.sam @@ -7,7 +7,7 @@ @PG ID:P2 VN:1.1 @CO this is a comment @CO this is another comment -read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:22 RG:Z:L1 PG:Z:P1 XT:A:U read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R read_28701_28881_323c 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< -test_clipped1 99 chr2 997 20 4S6M1D20M5S = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U +test_clipped1 99 chr2 997 20 4S6M1D20M5S = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:21 RG:Z:L1 PG:Z:P1 XT:A:U diff --git a/tests/pysam_data/example_empty_with_header.sam b/tests/pysam_data/example_empty_with_header.sam index 60f088e..3764780 100644 --- a/tests/pysam_data/example_empty_with_header.sam +++ b/tests/pysam_data/example_empty_with_header.sam @@ -1 +1 @@ -@HD VN:1.3 SO:coordinate +@HD VN:1.3 SO:coordinate diff --git a/tests/pysam_data/example_user_header.sam b/tests/pysam_data/example_user_header.sam deleted file mode 100644 index cebe71d..0000000 --- a/tests/pysam_data/example_user_header.sam +++ /dev/null @@ -1,8 +0,0 @@ -@HD VN:1.0 -@SQ SN:chr1 LN:1575 -@SQ SN:chr2 LN:1584 -@x1 A:2 B:5 -@x2 A:4 B:5 -@x3 A:6 B:5 -read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 -read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 diff --git a/tests/pysam_data/rg_with_tab.sam b/tests/pysam_data/rg_with_tab.sam index d2cb7bb..2420921 100644 --- a/tests/pysam_data/rg_with_tab.sam +++ b/tests/pysam_data/rg_with_tab.sam @@ -1,6 +1,6 @@ @SQ SN:chr1 LN:1575 @SQ SN:chr2 LN:1584 -@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq +@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq EAS56_57:6:190:289:82 69 chr1 100 0 * = 100 0 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; MF:i:192 
EAS56_57:6:190:289:82 137 chr1 100 73 35M = 100 0 AGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCAC <<<<<<;<<<<<<<<<<;<<;<<<<;8<6;9;;2; MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0
EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
new file mode 100644
index 0000000..4f75499
--- /dev/null
+++ b/tests/samtools_test.py
@@ -0,0 +1,423 @@
+#!/usr/bin/env python
+'''unit testing code for pysam.
+
+Execute in the :file:`tests` directory as it requires the Makefile
+and data files located there.
+'''
+
+import warnings
+import unittest
+import os
+import re
+import glob
+import sys
+import subprocess
+import shutil
+import pysam
+import pysam.samtools
+import pysam.bcftools
+from TestUtils import checkBinaryEqual, check_lines_equal, \
+    check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \
+    BAM_DATADIR
+
+
+IS_PYTHON3 = sys.version_info[0] >= 3
+
+
+def run_command(cmd):
+    '''run a samtools command'''
+    try:
+        retcode = subprocess.call(cmd, shell=True,
+                                  stderr=subprocess.PIPE)
+        if retcode < 0:
+            print("Child was terminated by signal", -retcode)
+    except OSError as e:
+        print("Execution failed:", e)
+
+
+def get_version(executable):
+    '''return samtools/bcftools version'''
+
+    with subprocess.Popen(executable, shell=True,
+                          stderr=subprocess.PIPE).stderr as pipe:
+        lines = b"".join(pipe.readlines())
+
+    if IS_PYTHON3:
+        lines = lines.decode('ascii')
+    try:
+        x = re.search(r"Version:\s+(\S+)", lines).groups()[0]
+    except AttributeError:
+        raise ValueError("could not get version from %s" % lines)
+    return x
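get_version shells out to the bare executable and scrapes the version from the usage banner the tool writes to stderr. A small illustration of the regex it applies — the banner text here is only a typical example of what samtools prints, not output captured for this patch:

    import re

    # typical opening of the usage text samtools writes to stderr
    banner = ("Program: samtools (Tools for alignments in the SAM format)\n"
              "Version: 1.9 (using htslib 1.9)\n")

    match = re.search(r"Version:\s+(\S+)", banner)
    assert match is not None and match.groups()[0] == "1.9"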
+class SamtoolsTest(unittest.TestCase):
+
+    '''test samtools command line commands and compare
+    against pysam commands.
+
+    Tests fail if the output is not binary identical.
+    '''
+
+    requisites = [
+        "ex1.fa", "ex1.fa.fai",
+        "ex1.sam.gz",
+        "ex1.bam", "ex1.bam.bai",
+        "ex1.sam",
+        "ex1.sam",
+        "ex2.bam",
+        "ex2.sam",
+        "ex1.bed"]
+
+    # a list of statements to test
+    # should contain at least one %(out)s component indicating
+    # an output file.
+    statements = [
+        "view ex1.bam > %(out)s_ex1.view",
+        "view -c ex1.bam > %(out)s_ex1.count",
+        # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
+        "sort ex1.bam -o %(out)s_ex1.sort.bam",
+        "mpileup ex1.bam > %(out)s_ex1.pileup",
+        "depth ex1.bam > %(out)s_ex1.depth",
+        # TODO: issues with file naming
+        # "faidx ex1.fa; %(out)s_ex1.fa.fai",
+        "index ex1.bam %(out)s_ex1.bam.fai",
+        "index -@2 ex1.bam %(out)s_ex1.bam.fai",
+        "idxstats ex1.bam > %(out)s_ex1.idxstats",
+        "fixmate ex1.bam %(out)s_ex1.fixmate.bam",
+        "flagstat ex1.bam > %(out)s_ex1.flagstat",
+        # Fails on Python 3.3 on Linux, passes on OS X and when
+        # run locally
+        "calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam",
+        # use -s option, otherwise the following error in samtools 1.2:
+        # Samtools-htslib-API: bam_get_library() not yet implemented
+        # causes downstream problems
+        # TODO: The following cause subsequent commands to fail
+        # unknown option
+        # "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
+        # "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
+        "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam",
+        "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam",
+        "targetcut ex1.bam > %(out)s_ex1.targetcut",
+        "phase ex1.bam > %(out)s_ex1.phase",
+        "view -bt ex1.fa.fai ex1.sam.gz > %(out)s_ex1.bam",
+        "bam2fq ex1.bam > %(out)s_ex1.bam2fq",
+        # TODO: not the same
+        # "pad2unpad -T ex1.fa ex2.bam > %(out)s_ex2.unpad",
+        # TODO: command line option problem
+        # "bamshuf ex1.bam -O --output-fmt SAM > %(out)s_ex1.bamshuf.sam",
+        # "collate ex1.bam %(out)s_ex1.collate",
+        "bedcov ex1.bed ex1.bam > %(out)s_ex1.bedcov",
+        "stats ex1.bam > %(out)s_ex1.stats",
+        "dict ex1.bam > %(out)s_ex1.dict",
+        # TODO: not the same
+        # ("addreplacerg -r 'RG\tID:ga\tSM:hs' ex1.bam > %(out)s_ex1.addreplacerg",
+    ]
+
+    map_command = {
+        "import": "samimport"}
+
+    executable = "samtools"
+
+    module = pysam.samtools
+
+    def check_version(self):
+
+        samtools_version = get_version(self.executable)
+
+        def _r(s):
+            # patch - remove any of the alpha/beta suffixes, i.e., 0.1.12a ->
+            # 0.1.12
+            if s.count('-') > 0:
+                s = s[0:s.find('-')]
+            return re.sub("[^0-9.]", "", s)
+
+        if _r(samtools_version) != _r(pysam.__samtools_version__):
+            warnings.warn(
+                "versions of pysam.%s and %s differ: %s != %s" %
+                (self.executable,
+                 self.executable,
+                 pysam.__samtools_version__,
+                 samtools_version))
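The nested _r helper reduces both version strings to a bare numeric core before comparing them. A quick sketch of what that normalization does (the sample inputs are illustrative, not taken from this patch):

    import re

    def normalize(s):
        # drop anything from the first '-' on (e.g. a git-describe suffix),
        # then keep only digits and dots
        if s.count('-') > 0:
            s = s[0:s.find('-')]
        return re.sub("[^0-9.]", "", s)

    assert normalize("0.1.12a") == "0.1.12"
    assert normalize("1.9-64-gf9f8702") == "1.9"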
+
+    def setUp(self):
+        '''setup tests.
+
+        For setup, all commands will be run before the first test is
+        executed. Individual tests will then just compare the output
+        files.
+
+        '''
+        self.check_version()
+
+        self.workdir = os.path.join(WORKDIR, "samtools_test")
+
+        if not os.path.exists(self.workdir):
+            os.makedirs(self.workdir)
+
+        for f in self.requisites:
+            shutil.copy(os.path.join(BAM_DATADIR, f),
+                        os.path.join(self.workdir, f))
+
+        self.savedir = os.getcwd()
+        os.chdir(self.workdir)
+
+        return
+
+    def get_command(self, statement, map_to_internal=True):
+        """return samtools command from statement"""
+        parts = statement.split(" ")
+        command = parts[0]
+        if map_to_internal:
+            return self.map_command.get(command, command)
+        else:
+            return command
+
+    def check_statement(self, statement):
+
+        parts = statement.split(" ")
+        r_samtools = {"out": self.executable}
+        r_pysam = {"out": "pysam"}
+
+        command = self.get_command(statement)
+
+        # self.assertTrue(command in pysam.SAMTOOLS_DISPATCH)
+
+        targets = [x for x in parts if "%(out)s" in x]
+        samtools_targets = [x % r_samtools for x in targets]
+        pysam_targets = [x % r_pysam for x in targets]
+
+        pysam_method = getattr(self.module, command)
+
+        # run samtools
+        full_statement = re.sub(r"%\(out\)s", self.executable, statement)
+        run_command(" ".join((self.executable, full_statement)))
+        # sys.stdout.write("%s %s ok" % (command, self.executable))
+
+        # run pysam
+        if ">" in statement:
+            assert parts[-2] == ">"
+            parts = parts[:-2]
+
+        # avoid interpolation to preserve string quoting, tab chars, etc.
+        pysam_parts = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:]]
+        output = pysam_method(*pysam_parts,
+                              raw=True,
+                              catch_stdout=True)
+        # sys.stdout.write(" pysam ok\n")
+        if ">" in statement:
+            with open(pysam_targets[-1], "wb") as outfile:
+                if output is not None:
+                    outfile.write(force_bytes(output))
+        for samtools_target, pysam_target in zip(samtools_targets,
+                                                 pysam_targets):
+            if os.path.isdir(samtools_target):
+                samtools_files = glob.glob(os.path.join(
+                    samtools_target, "*"))
+                pysam_files = glob.glob(os.path.join(pysam_target, "*"))
+                self.assertEqual(len(samtools_files), len(pysam_files))
+                # need to be able to exclude files like README, etc.
+                continue
+            else:
+                samtools_files = [samtools_target]
+                pysam_files = [pysam_target]
+
+            for s, p in zip(samtools_files, pysam_files):
+                binary_equal = checkBinaryEqual(s, p)
+                error_msg = "%s failed: files %s and %s are not the same" % (
+                    command, s, p)
+                if binary_equal:
+                    continue
+                elif s.endswith(".bam"):
+                    self.assertTrue(
+                        check_samtools_view_equal(
+                            s, p, without_header=True),
+                        error_msg)
+                else:
+                    check_lines_equal(
+                        self, s, p,
+                        filter_f=lambda x: x.startswith("#"),
+                        msg=error_msg)
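check_statement therefore executes every template twice: once through the samtools binary with %(out)s replaced by the executable name, and once through the pysam binding with %(out)s replaced by "pysam", stripping the shell redirection and writing the captured stdout itself. A minimal sketch of that expansion for one of the templates above:

    import re

    statement = "view -c ex1.bam > %(out)s_ex1.count"
    parts = statement.split(" ")

    # CLI run: interpolate, then prepend the executable itself
    cli = "samtools " + re.sub(r"%\(out\)s", "samtools", statement)

    # pysam run: interpolate and drop the "> target" redirection; the
    # captured stdout is later written to the target file by the test
    target = parts[-1].replace("%(out)s", "pysam")
    args = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:-2]]

    assert cli == "samtools view -c ex1.bam > samtools_ex1.count"
    assert args == ["-c", "ex1.bam"]
    assert target == "pysam_ex1.count"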
+
+    def testStatements(self):
+        for statement in self.statements:
+            command = self.get_command(statement, map_to_internal=False)
+            # bam2fq differs between version 1.5 and 1.6 - re-enable once
+            # samtools is available via bioconda.
+            if command in ("bedcov", "stats", "dict", "bam2fq"):
+                continue
+
+            if (command == "calmd" and
+                    list(sys.version_info[:2]) == [3, 3]):
+                # skip calmd test, fails only on Python 3.3.5
+                # on Linux (empty output). Works on OS X and passes
+                # for 3.4 and 3.5, see issue #293
+                continue
+            self.check_statement(statement)
+
+    @unittest.skipIf(sys.platform == "darwin", "not supported, pattern does not match")
+    @unittest.skipIf(not sys.stdin.isatty(), "skipping usage tests, stdin is not a tty")
+    def testUsage(self):
+        if self.executable == "bcftools":
+            # bcftools usage messages end with exit(1)
+            return
+
+        for statement in self.statements:
+            command = self.get_command(statement, map_to_internal=False)
+            # ignore commands that exit or cause other failures
+            # TODO: check - if reheader or phase is run in testStatements, sort fails
+            # here
+            if command in ("view", "sort", "bam2fq", "flagstat", "reheader",
+                           "stats", "idxstats"):
+                continue
+            mapped_command = self.get_command(statement, map_to_internal=True)
+            pysam_method = getattr(self.module, mapped_command)
+            usage_msg = pysam_method.usage()
+            expected = r"Usage:\s+{} {}".format(self.executable, command)
+            self.assertTrue(re.search(expected, usage_msg) is not None)
+
+    def tearDown(self):
+        if os.path.exists(self.workdir):
+            shutil.rmtree(self.workdir)
+        os.chdir(self.savedir)
+
+
+class EmptyIndexTest(unittest.TestCase):
+
+    def testEmptyIndex(self):
+        self.assertRaises(IOError, pysam.samtools.index,
+                          "exdoesntexist.bam")
+
+    def testEmptyIndexWithExtraFlag(self):
+        self.assertRaises(IOError, pysam.samtools.index,
+                          "-c",
+                          "exdoesntexist.bam")
+
+    def testEmptyIndexWithExtraArg(self):
+        self.assertRaises(IOError, pysam.samtools.index,
+                          "-c",
+                          "-m",
+                          "14",
+                          "exdoesntexist.bam")
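check_statement above falls back from checkBinaryEqual to the SAM-aware comparisons; TestUtils itself is not part of this patch, so the exact helper is not shown. As a rough stand-in, a byte-for-byte comparison of two output files could look like this (name and behaviour inferred from the call site, not taken from TestUtils):

    import filecmp

    def check_binary_equal(filename1, filename2):
        # True iff the two files have identical content, byte for byte
        return filecmp.cmp(filename1, filename2, shallow=False)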
+
+
+if sys.platform != "darwin":
+    # fails with segfault with htslib 1.5 on OS X, an issue with flockfile
+    # issue seems to be with repeated calls to interface
+
+    class TestReturnType(unittest.TestCase):
+
+        def testReturnValueString(self):
+            retval = pysam.idxstats(os.path.join(BAM_DATADIR, "ex1.bam"))
+            if IS_PYTHON3:
+                self.assertFalse(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, str))
+            else:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, basestring))
+
+        def testReturnValueData(self):
+            args = "-O BAM {}".format(os.path.join(BAM_DATADIR,
+                                                   "ex1.bam")).split(" ")
+            retval = pysam.view(*args)
+
+            if IS_PYTHON3:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertFalse(isinstance(retval, str))
+            else:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, basestring))
+
+    class StdoutTest(unittest.TestCase):
+        '''test if stdout can be redirected.'''
+
+        def testWithRedirectedStdout(self):
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+            self.assertTrue(len(r) > 0)
+
+        def testWithoutRedirectedStdout(self):
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"),
+                catch_stdout=False)
+            self.assertEqual(r, None)
+
+        def testDoubleCalling(self):
+            # The following would fail if there is an
+            # issue with stdout being improperly caught.
+            retvals = pysam.idxstats(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+            retvals = pysam.idxstats(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+
+        def testSaveStdout(self):
+            outfile = get_temp_filename(suffix=".tsv")
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"),
+                save_stdout=outfile)
+            self.assertEqual(r, None)
+            with open(outfile) as inf:
+                r = inf.read()
+            self.assertTrue(len(r) > 0)
+
+    class PysamTest(SamtoolsTest):
+        """check access to samtools command in the pysam
+        main package.
+
+        This is for backwards compatibility.
+        """
+
+        module = pysam
+
+# class BcftoolsTest(SamtoolsTest):
+
+#     requisites = [
+#         "ex1.fa",
+#         "ex1.vcf.gz",
+#         "ex1.vcf.gz.tbi",
+#     ]
+#     # a list of statements to test
+#     # should contain at least one %(out)s component indicating
+#     # an output file.
+#     statements = [
+#         # "index -n ex1.vcf.gz > %(out)s_ex1.index",
+
+#         "annotate -x ID ex1.vcf.gz > %(out)s_ex1.annotate",
+#         "concat -a ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.concat",
+#         "isec -p %(out)s_ex1.isec ex1.vcf.gz ex1.vcf.gz",
+#         "merge --force-samples ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.norm",
+#         "norm -m +both ex1.vcf.gz > %(out)s_ex1.norm",

+#         # "plugin",
+#         # "query -f '%CHROM\n' ex1.vcf.gz > %(out)s_ex1.query",
+#         # "reheader -s A > %(out)s_ex1.reheader",
+#         # "view ex1.vcf.gz > %(out)s_ex1.view",
+#         # "call -m ex1.vcf.gz > %(out)s_ex1.call",
+#         # bad file descriptor
+#         # "consensus -f ex1.fa ex1.vcf.gz > %(out)s_ex1.consensus"
+#         # need appropriate VCF file
+#         # "cnv",
+#         # segfault
+#         # "filter -s A ex1.vcf.gz > %(out)s_ex1.filter",
+#         # exit
+#         # "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck",
+#         # segfault, used to work with bcftools 1.3
+#         # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
+#         "stats ex1.vcf.gz > %(out)s_ex1.stats",
+#     ]

+#     map_command = {
+#         "import": "samimport"}

+#     executable = "bcftools"

+#     module = pysam.bcftools
+
+
+if __name__ == "__main__":
+    # build data files
+    print("building data files")
+    subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
+    print("starting tests")
+    unittest.main()
+    print("completed tests")
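Taken together, the suite asserts that the samtools binary and the pysam bindings produce the same bytes. The same check can be reproduced interactively; a sketch assuming a samtools binary of the matching version on PATH, Python 3 (where the binding returns str), and the ex1.bam fixture from tests/pysam_data:

    import subprocess
    import pysam.samtools

    # run the installed binary and capture its stdout
    cli_out = subprocess.check_output(["samtools", "flagstat", "ex1.bam"])

    # run the same command through the binding; captured stdout is returned
    pysam_out = pysam.samtools.flagstat("ex1.bam")

    assert cli_out.decode("ascii") == pysam_out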