-
Notifications
You must be signed in to change notification settings - Fork 37
VCF support for RLE and fix to same-as-reference allele normalization to RLE #589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6b36ba1
76cd565
4f22083
94cc277
d048501
6051b11
c627c50
499002e
4db6c3c
3e121be
eac0dc5
491de1b
b1a7b5f
bce957a
c0042c3
6df2ba2
460d6bb
b7fa124
576e4db
70bd6e2
adf5f73
2c5c1dd
9a65b82
94ad735
7723d5c
6255779
26eeba3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -16,7 +16,7 @@ | |||
| from ga4gh.vrs import VRS_VERSION, __version__ | ||||
| from ga4gh.vrs.dataproxy import _DataProxy | ||||
| from ga4gh.vrs.extras.translator import AlleleTranslator | ||||
| from ga4gh.vrs.models import Allele | ||||
| from ga4gh.vrs.models import Allele, Range, sequenceString | ||||
|
|
||||
| _logger = logging.getLogger(__name__) | ||||
|
|
||||
|
|
@@ -36,17 +36,10 @@ class FieldName(str, Enum): | |||
| STARTS_FIELD = "VRS_Starts" | ||||
| ENDS_FIELD = "VRS_Ends" | ||||
| STATES_FIELD = "VRS_States" | ||||
| LENGTHS_FIELD = "VRS_Lengths" | ||||
| REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths" | ||||
| ERROR_FIELD = "VRS_Error" | ||||
|
|
||||
| def default_value(self) -> Literal[".", -1]: | ||||
| """Provide value to use for default/null case in VCF INFO field | ||||
|
|
||||
| :return: either ``"."`` or ``-1`` | ||||
| """ | ||||
| if self in (FieldName.IDS_FIELD, FieldName.STATES_FIELD, FieldName.ERROR_FIELD): | ||||
| return "." | ||||
| return -1 | ||||
|
|
||||
|
|
||||
| # VCF character escape map | ||||
| VCF_ESCAPE_MAP = str.maketrans( | ||||
|
|
@@ -59,6 +52,19 @@ def default_value(self) -> Literal[".", -1]: | |||
| } | ||||
| ) | ||||
|
|
||||
| # Short State.sequence will be included in output VCF if <= this value, | ||||
| # otherwise output will be emitted as the "." character. | ||||
| MAX_LITERAL_STATE_LENGTH = 15 | ||||
|
|
||||
|
|
||||
| def _sequence_value_for_info(sequence: str | sequenceString) -> str | None: | ||||
| """Return a literal sequence suitable for INFO fields if it is short enough.""" | ||||
| if not sequence: | ||||
| return None | ||||
| seq = getattr(sequence, "root", sequence) | ||||
| seq_str = str(seq) | ||||
| return seq_str if len(seq_str) <= MAX_LITERAL_STATE_LENGTH else None | ||||
|
|
||||
|
|
||||
| def dump_alleles_to_pkl(alleles: list[Allele], output_pkl_path: Path) -> None: | ||||
| """Create pkl file of dictionary mapping VRS IDs to ingested alleles. | ||||
|
|
@@ -187,6 +193,24 @@ def _update_vcf_header( | |||
| f"corresponding to the GT indexes of the {info_field_desc} alleles" | ||||
| ), | ||||
| ) | ||||
| vcf.header.info.add( | ||||
| FieldName.LENGTHS_FIELD.value, | ||||
| info_field_num, | ||||
| "Integer", | ||||
| ( | ||||
| "The length values from ReferenceLengthExpression states for the GA4GH VRS " | ||||
| f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles" | ||||
| ), | ||||
| ) | ||||
| vcf.header.info.add( | ||||
| FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD.value, | ||||
| info_field_num, | ||||
| "Integer", | ||||
| ( | ||||
| "The repeat subunit length values from ReferenceLengthExpression states for the GA4GH VRS " | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this be |
||||
| f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles" | ||||
| ), | ||||
| ) | ||||
|
|
||||
| @use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING) | ||||
| def annotate( | ||||
|
|
@@ -244,6 +268,8 @@ def annotate( | |||
| FieldName.STARTS_FIELD, | ||||
| FieldName.ENDS_FIELD, | ||||
| FieldName.STATES_FIELD, | ||||
| FieldName.LENGTHS_FIELD, | ||||
| FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD, | ||||
| ] | ||||
| else: | ||||
| # no INFO field names need to be designated if not producing an annotated VCF | ||||
|
|
@@ -275,8 +301,10 @@ def annotate( | |||
|
|
||||
| if output_vcf_path and vcf_out: | ||||
| for k in additional_info_fields: | ||||
| # Convert "" and None values (but not 0) to None. | ||||
| # Pysam outputs "." for missing values. | ||||
| record.info[k.value] = [ | ||||
| value or k.default_value() for value in vrs_field_data[k.value] | ||||
| None if v in ("", None) else v for v in vrs_field_data[k.value] | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. double checking -- does this convert sequences that are an empty string into |
||||
| ] | ||||
| vcf_out.write(record) | ||||
|
|
||||
|
|
@@ -369,20 +397,49 @@ def _get_vrs_object( | |||
| vrs_field_data[FieldName.IDS_FIELD].append(allele_id) | ||||
|
|
||||
| if vrs_attributes: | ||||
| # Initialize fields with None for missing values | ||||
| # pysam will convert None to "." in VCF output | ||||
| start = end = None | ||||
| alt = None | ||||
| length = repeat_subunit_length = None | ||||
|
|
||||
| if vrs_obj: | ||||
| # Common fields for all state types | ||||
| start = vrs_obj.location.start | ||||
| end = vrs_obj.location.end | ||||
| alt = ( | ||||
| str(vrs_obj.state.sequence.root) | ||||
| if vrs_obj.state.sequence | ||||
| else "" | ||||
| ) | ||||
| else: | ||||
| start = end = alt = "" | ||||
| state = vrs_obj.state | ||||
|
|
||||
| # State-specific fields | ||||
| state_type = state.type | ||||
| alt = _sequence_value_for_info(getattr(state, "sequence", None)) | ||||
|
|
||||
| if state_type in ( | ||||
| "ReferenceLengthExpression", | ||||
| "LengthExpression", | ||||
| ): | ||||
| # For RLE and LE, populate length | ||||
| length = state.length | ||||
| if length is None: | ||||
| err_msg = f"{state_type} requires a non-empty length: {vcf_coords}" | ||||
| raise VcfAnnotatorError(err_msg) | ||||
| if isinstance(length, Range): | ||||
| err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}" | ||||
| raise VcfAnnotatorError(err_msg) | ||||
| # For RLE only, also populate repeatSubunitLength | ||||
| if state_type == "ReferenceLengthExpression": | ||||
| repeat_subunit_length = state.repeatSubunitLength | ||||
| else: | ||||
| if state_type != "LiteralSequenceExpression": | ||||
| err_msg = f"Unsupported state type '{state_type}' for VCF annotation: {vcf_coords}" | ||||
| raise VcfAnnotatorError(err_msg) | ||||
|
|
||||
| vrs_field_data[FieldName.STARTS_FIELD].append(start) | ||||
| vrs_field_data[FieldName.ENDS_FIELD].append(end) | ||||
| vrs_field_data[FieldName.STATES_FIELD].append(alt) | ||||
| vrs_field_data[FieldName.LENGTHS_FIELD].append(length) | ||||
| vrs_field_data[FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD].append( | ||||
| repeat_subunit_length | ||||
| ) | ||||
|
|
||||
| def _get_vrs_data( | ||||
| self, | ||||
|
|
@@ -444,7 +501,7 @@ def _get_vrs_data( | |||
| allele_collection, | ||||
| vrs_field_data, | ||||
| assembly, | ||||
| vrs_data_key=data, | ||||
| vrs_data_key=data, # TODO unused? | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be okay to remove. It appears that this was used historically to add individual alleles as a key into an accumulated dictionary of everything in the VCF. Doesn't seem to get used anymore, and it feels clumsy to be using a tab-separated string rather than a tuple or something anyway.
|
||||
| vrs_attributes=vrs_attributes, | ||||
| require_validation=require_validation, | ||||
| ) | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
placeholder for thinking through implications of this, briefly mentioned on slack