Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6b36ba1
Initial revision of ReferenceLengthExpression in VCF annotator
theferrit32 Oct 15, 2025
76cd565
No longer need FieldName.default_value since we can rely on pysam
theferrit32 Oct 15, 2025
4f22083
Fix INFO field setting to allow 0 for Integers.
theferrit32 Oct 16, 2025
94cc277
Reworking the RLE flow
theferrit32 Oct 16, 2025
d048501
Move around code some more
theferrit32 Oct 16, 2025
6051b11
Adding some tests
theferrit32 Oct 22, 2025
c627c50
Add more RLE test cases
theferrit32 Nov 11, 2025
499002e
Fix test_allele_translator for RLE NC_000013.11:g.32936732=
theferrit32 Nov 11, 2025
4db6c3c
remove id/digest testing from allele test
theferrit32 Nov 12, 2025
3e121be
Fix test_annotate_vcf_grch38_noattrs with ref alleles
theferrit32 Nov 12, 2025
eac0dc5
Update incorrect ref allele test case. Refresh expected VCFs
theferrit32 Nov 12, 2025
491de1b
Refresh tests/cassettes/test_normalize_allele.yaml
theferrit32 Nov 12, 2025
b1a7b5f
Update comment
theferrit32 Nov 12, 2025
bce957a
Change test VCFs to uncompressed text so diffs can be seen going forw…
theferrit32 Nov 12, 2025
c0042c3
Include sequence value in output vcf if the length of that sequence i…
theferrit32 Nov 12, 2025
6df2ba2
Remove commented blocks
theferrit32 Nov 12, 2025
460d6bb
Update test name
theferrit32 Nov 12, 2025
b7fa124
Add comments to cases in test_vrs_normalize.py
theferrit32 Nov 12, 2025
576e4db
Add make target clean-cassettes
theferrit32 Nov 12, 2025
70bd6e2
Minor tweak to all_played checks in vcf annotator tests to make re-re…
theferrit32 Nov 12, 2025
adf5f73
Ignore VRS and Python version header values in VCF tests
theferrit32 Nov 12, 2025
2c5c1dd
Add missing cassette
theferrit32 Nov 13, 2025
9a65b82
Don't fail test when vcr is being rewritten. Refresh cassettes
theferrit32 Nov 13, 2025
94ad735
add decode_compressed_response=True to vcr config
theferrit32 Nov 13, 2025
7723d5c
Add vcr_config to conftest.py. Delete vcr_support.py. Refresh cassettes
theferrit32 Nov 13, 2025
6255779
Merge remote-tracking branch 'origin/main' into issue-577-VCF-RLE
theferrit32 Nov 17, 2025
26eeba3
Reset casettes for RLE tests after merge from main
theferrit32 Nov 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ cleaner: clean
cleanest: cleaner
rm -fr .eggs venv

#=> clean-cassettes: delete YAML VCR cassettes under tests/**/cassettes/
.PHONY: clean-cassettes
clean-cassettes:
find ./tests -type f -path '*/cassettes/*.yaml' -print0 | ${XRM}


## <LICENSE>
## Copyright 2016 Source Code Committers
Expand Down
95 changes: 76 additions & 19 deletions src/ga4gh/vrs/extras/annotator/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ga4gh.vrs import VRS_VERSION, __version__
from ga4gh.vrs.dataproxy import _DataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator
from ga4gh.vrs.models import Allele
from ga4gh.vrs.models import Allele, Range, sequenceString

_logger = logging.getLogger(__name__)

Expand All @@ -36,17 +36,10 @@ class FieldName(str, Enum):
STARTS_FIELD = "VRS_Starts"
ENDS_FIELD = "VRS_Ends"
STATES_FIELD = "VRS_States"
LENGTHS_FIELD = "VRS_Lengths"
REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths"
ERROR_FIELD = "VRS_Error"

def default_value(self) -> Literal[".", -1]:
"""Provide value to use for default/null case in VCF INFO field

:return: either ``"."`` or ``-1``
"""
if self in (FieldName.IDS_FIELD, FieldName.STATES_FIELD, FieldName.ERROR_FIELD):
return "."
return -1


# VCF character escape map
VCF_ESCAPE_MAP = str.maketrans(
Expand All @@ -59,6 +52,19 @@ def default_value(self) -> Literal[".", -1]:
}
)

# Maximum length (in characters) of a state sequence that is written literally
# to the output VCF. A sequence whose length is <= this value is included as-is;
# otherwise output will be emitted as the "." character.
MAX_LITERAL_STATE_LENGTH = 15
Comment on lines +55 to +57
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

placeholder for thinking through implications of this, briefly mentioned on slack



def _sequence_value_for_info(sequence: str | sequenceString | None) -> str | None:
    """Return a literal sequence suitable for INFO fields if it is short enough.

    :param sequence: a plain string, an object exposing the sequence text through
        a ``root`` attribute, or ``None`` when the state carries no sequence
    :return: the sequence text when it is non-empty and its length is at most
        ``MAX_LITERAL_STATE_LENGTH``; otherwise ``None``
    """
    if sequence is None:
        return None
    # Unwrap before testing for emptiness: a wrapper object may be truthy even
    # when the text it holds is empty, so checking the wrapper itself would let
    # an empty wrapped sequence through as "" while a plain "" yields None.
    seq_str = str(getattr(sequence, "root", sequence))
    if not seq_str or len(seq_str) > MAX_LITERAL_STATE_LENGTH:
        return None
    return seq_str


def dump_alleles_to_pkl(alleles: list[Allele], output_pkl_path: Path) -> None:
"""Create pkl file of dictionary mapping VRS IDs to ingested alleles.
Expand Down Expand Up @@ -187,6 +193,24 @@ def _update_vcf_header(
f"corresponding to the GT indexes of the {info_field_desc} alleles"
),
)
vcf.header.info.add(
FieldName.LENGTHS_FIELD.value,
info_field_num,
"Integer",
(
"The length values from ReferenceLengthExpression states for the GA4GH VRS "
f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
),
)
vcf.header.info.add(
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD.value,
info_field_num,
"Integer",
(
"The repeat subunit length values from ReferenceLengthExpression states for the GA4GH VRS "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this be repeat_subunit_length or repeatSubunitLength or something just to make it clear that it's a specific property

f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
),
)

@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
def annotate(
Expand Down Expand Up @@ -244,6 +268,8 @@ def annotate(
FieldName.STARTS_FIELD,
FieldName.ENDS_FIELD,
FieldName.STATES_FIELD,
FieldName.LENGTHS_FIELD,
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD,
]
else:
# no INFO field names need to be designated if not producing an annotated VCF
Expand Down Expand Up @@ -275,8 +301,10 @@ def annotate(

if output_vcf_path and vcf_out:
for k in additional_info_fields:
# Convert "" and None values (but not 0) to None.
# Pysam outputs "." for missing values.
record.info[k.value] = [
value or k.default_value() for value in vrs_field_data[k.value]
None if v in ("", None) else v for v in vrs_field_data[k.value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double checking -- does this convert sequences that are an empty string into None/"."?

]
vcf_out.write(record)

Expand Down Expand Up @@ -369,20 +397,49 @@ def _get_vrs_object(
vrs_field_data[FieldName.IDS_FIELD].append(allele_id)

if vrs_attributes:
# Initialize fields with None for missing values
# pysam will convert None to "." in VCF output
start = end = None
alt = None
length = repeat_subunit_length = None

if vrs_obj:
# Common fields for all state types
start = vrs_obj.location.start
end = vrs_obj.location.end
alt = (
str(vrs_obj.state.sequence.root)
if vrs_obj.state.sequence
else ""
)
else:
start = end = alt = ""
state = vrs_obj.state

# State-specific fields
state_type = state.type
alt = _sequence_value_for_info(getattr(state, "sequence", None))

if state_type in (
"ReferenceLengthExpression",
"LengthExpression",
):
# For RLE and LE, populate length
length = state.length
if length is None:
err_msg = f"{state_type} requires a non-empty length: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
if isinstance(length, Range):
err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
# For RLE only, also populate repeatSubunitLength
if state_type == "ReferenceLengthExpression":
repeat_subunit_length = state.repeatSubunitLength
else:
if state_type != "LiteralSequenceExpression":
err_msg = f"Unsupported state type '{state_type}' for VCF annotation: {vcf_coords}"
raise VcfAnnotatorError(err_msg)

vrs_field_data[FieldName.STARTS_FIELD].append(start)
vrs_field_data[FieldName.ENDS_FIELD].append(end)
vrs_field_data[FieldName.STATES_FIELD].append(alt)
vrs_field_data[FieldName.LENGTHS_FIELD].append(length)
vrs_field_data[FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD].append(
repeat_subunit_length
)

def _get_vrs_data(
self,
Expand Down Expand Up @@ -444,7 +501,7 @@ def _get_vrs_data(
allele_collection,
vrs_field_data,
assembly,
vrs_data_key=data,
vrs_data_key=data, # TODO unused?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be okay to remove. It appears that this was used historically to add individual alleles as a key into an accumulated dictionary of everything in the VCF. Doesn't seem to get used anymore, and it feels clumsy to be using a tab-separated string rather than a tuple or something anyway.

key = vrs_data_key if vrs_data_key else vcf_coords

vrs_attributes=vrs_attributes,
require_validation=require_validation,
)
Expand Down
36 changes: 23 additions & 13 deletions src/ga4gh/vrs/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,19 +134,29 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):
trim_ival, trim_alleles = _normalize(
ref_seq, ival, alleles, mode=None, trim=True
)
except ValueError:
# Occurs for ref agree Alleles (when alt = ref)
len_trimmed_ref = len_trimmed_alt = 0
# TODO: Return RLE for ref agree Alleles
else:
trim_ref_seq = ref_seq[trim_ival[0] : trim_ival[1]]
trim_alt_seq = trim_alleles[1]
len_trimmed_ref = len(trim_ref_seq)
len_trimmed_alt = len(trim_alt_seq)

# Compare the two allele sequences
if not len_trimmed_ref and not len_trimmed_alt:
return input_allele
except ValueError as e:
# bioutils _normalize raises ValueError for reference alleles when in trim=True.
# But verify this is actually a reference allele before treating it as such.
ref_at_location = ref_seq[start.value : end.value]
alt_seq = alleles[1]
if ref_at_location == alt_seq:
# Return RLE with length and repeatSubunitLength both set to ref (and alt) length
new_allele = pydantic_copy(input_allele)
return _define_rle_allele(
new_allele,
length=len(ref_at_location),
repeat_subunit_length=len(ref_at_location),
rle_seq_limit=rle_seq_limit,
extended_alt_seq=ref_at_location,
)
# Re-raise if this is a different ValueError (shouldn't happen with valid input and assuming bioutils hasn't changed behavior)
msg = f"Unexpected bioutils trim error for non reference allele: ref='{ref_at_location}', alt='{alt_seq}'"
raise ValueError(msg) from e

trim_ref_seq = ref_seq[trim_ival[0] : trim_ival[1]]
trim_alt_seq = trim_alleles[1]
len_trimmed_ref = len(trim_ref_seq)
len_trimmed_alt = len(trim_alt_seq)

new_allele = pydantic_copy(input_allele)

Expand Down
12 changes: 12 additions & 0 deletions tests/cassettes/test_normalize_allele.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -273,4 +273,16 @@ interactions:
status:
code: 200
message: OK
- request:
body: null
headers: {}
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210777&end=100210779
response:
body:
string: AA
headers: {}
status:
code: 200
message: OK
version: 1
Loading
Loading