Skip to content

Commit a41cdf2

Browse files
authored
Merge pull request #5030 from broadinstitute/datset-type-specific-discovery-genes
Dataset type specific discovery genes
2 parents bb3eb46 + 136be63 commit a41cdf2

File tree

2 files changed

+49
-23
lines changed

2 files changed

+49
-23
lines changed

seqr/views/apis/project_api_tests.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def test_project_families(self):
390390
gene_ids = self._assert_expected_project_families(url, response_keys)
391391
self.assertSetEqual(gene_ids, {'ENSG00000135953', 'ENSG00000240361'})
392392

393-
def _assert_expected_project_families(self, url, response_keys):
393+
def _assert_expected_project_families(self, url, response_keys, no_discovery_tags=False):
394394
response = self.client.get(url)
395395
self.assertEqual(response.status_code, 200)
396396

@@ -436,12 +436,14 @@ def _assert_expected_project_families(self, url, response_keys):
436436

437437
self.assertListEqual(family_3['discoveryTags'], [])
438438
self.assertListEqual(empty_family['discoveryTags'], [])
439-
self.assertListEqual(family_1['discoveryTags'], [{
439+
family_1_tags = [] if no_discovery_tags else [{
440440
'transcripts': {'ENSG00000135953': [mock.ANY, mock.ANY, mock.ANY, mock.ANY, mock.ANY, mock.ANY]},
441441
'mainTranscriptId': 'ENST00000258436',
442442
'selectedMainTranscriptId': None,
443-
}])
444-
self.assertListEqual(response_json['familiesByGuid']['F000002_2']['discoveryTags'], [self.DISCOVERY_TAG])
443+
}]
444+
self.assertListEqual(family_1['discoveryTags'], family_1_tags)
445+
family_2_tags = [] if no_discovery_tags else [self.DISCOVERY_TAG]
446+
self.assertListEqual(response_json['familiesByGuid']['F000002_2']['discoveryTags'], family_2_tags)
445447
no_discovery_families = set(response_json['familiesByGuid'].keys()) - {'F000001_1', 'F000002_2'}
446448
self.assertSetEqual({
447449
len(response_json['familiesByGuid'][family_guid]['discoveryTags']) for family_guid in no_discovery_families
@@ -802,8 +804,8 @@ def _assert_expected_project_families(self, *args, **kwargs):
802804
self.reset_logs()
803805
connections['clickhouse'].close()
804806
self.DISCOVERY_TAG = {**DISCOVERY_TAG, 'transcripts': {}}
805-
no_clickhouse_gene_ids = super()._assert_expected_project_families(*args, **kwargs)
806-
self.assertSetEqual(no_clickhouse_gene_ids, {'ENSG00000135953'})
807+
no_clickhouse_gene_ids = super()._assert_expected_project_families(*args, **kwargs, no_discovery_tags=True)
808+
self.assertSetEqual(no_clickhouse_gene_ids, set())
807809
self.assert_json_logs(None, [
808810
("Error loading discovery genes from clickhouse: An error occurred in the current transaction. You can't execute queries until the end of the 'atomic' block.", {
809811
'severity': 'ERROR',

seqr/views/utils/project_context_utils.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from collections import defaultdict
22
from django.db.models import Count, Q, F, prefetch_related_objects
3+
from django.contrib.postgres.aggregates import ArrayAgg
4+
from django.db.models.functions import JSONObject
35

4-
from clickhouse_search.search import get_transcripts_by_key
6+
from clickhouse_search.search import get_transcripts_by_key, get_annotations_queryset
57
from seqr.models import Individual, IgvSample, AnalysisGroup, DynamicAnalysisGroup, LocusList, VariantTagType,\
6-
VariantFunctionalData, FamilyNote, SavedVariant, VariantTag, VariantNote
8+
VariantFunctionalData, FamilyNote, SavedVariant, VariantTag, VariantNote, Sample
79
from seqr.utils.gene_utils import get_genes
810
from seqr.utils.logging_utils import SeqrLogger
911
from seqr.utils.search.utils import backend_specific_call
@@ -119,38 +121,60 @@ def families_discovery_tags(families, genome_version, project=None):
119121
families_by_guid = {f['familyGuid']: dict(discoveryTags=[], **f) for f in families}
120122

121123
family_filter = {'family__project': project} if project else {'family__guid__in': families_by_guid.keys()}
122-
discovery_tags_by_key = {v.pop('key'): v for v in SavedVariant.objects.filter(
124+
discovery_variants = SavedVariant.objects.filter(
123125
varianttag__variant_tag_type__category='CMG Discovery Tags', **family_filter,
124-
).values(
125-
'key', 'family__guid', selectedMainTranscriptId=F('selected_main_transcript_id'),
126-
transcripts=F('saved_variant_json__transcripts'), mainTranscriptId=F('saved_variant_json__mainTranscriptId'),
127-
)}
126+
)
128127
try:
129-
backend_specific_call(lambda *args: None, _add_clickhouse_transcripts)(discovery_tags_by_key, genome_version)
128+
discovery_tags = backend_specific_call(_get_no_key_tags, _get_clickhouse_tags)(
129+
discovery_variants, genome_version=genome_version,
130+
)
130131
except Exception as e:
131132
logger.error(f'Error loading discovery genes from clickhouse: {e}', None)
133+
discovery_tags = []
132134

133135
gene_ids = set()
134-
for tag in discovery_tags_by_key.values():
136+
for tag in discovery_tags:
135137
tag['transcripts'] = tag.get('transcripts') or {}
136138
gene_ids.update(list(tag['transcripts'].keys()))
137-
families_by_guid[tag.pop('family__guid')]['discoveryTags'].append(tag)
139+
families_by_guid[tag.pop('family_guid')]['discoveryTags'].append(tag)
138140

139141
return {
140142
'familiesByGuid': families_by_guid,
141143
'genesById': get_genes(gene_ids),
142144
}
143145

144146

145-
def _add_clickhouse_transcripts(discovery_tags_by_key, genome_version):
146-
transcripts_by_key = get_transcripts_by_key(genome_version, discovery_tags_by_key.keys())
147-
for key, tag in discovery_tags_by_key.items():
148-
if key in transcripts_by_key:
147+
def _get_no_key_tags(discovery_variants, **kwargs):
    """Return discovery tag values read directly from the saved variant rows.

    Transcript data comes from each variant's stored ``saved_variant_json``;
    no transcript lookup by variant key is made here.

    Args:
        discovery_variants: queryset of saved variants to describe.
        **kwargs: accepted and ignored, so this can be called with the same
            keyword arguments as ``_get_clickhouse_tags`` (e.g. ``genome_version``).

    Returns:
        The ``.values(...)`` result, one entry per variant with the keys
        ``family_guid``, ``selectedMainTranscriptId``, ``transcripts`` and
        ``mainTranscriptId``.
    """
    tag_fields = {
        'family_guid': F('family__guid'),
        'selectedMainTranscriptId': F('selected_main_transcript_id'),
        'transcripts': F('saved_variant_json__transcripts'),
        'mainTranscriptId': F('saved_variant_json__mainTranscriptId'),
    }
    return discovery_variants.values(**tag_fields)
152+
153+
154+
def _get_clickhouse_tags(discovery_variants, genome_version):
    """Build discovery tag dicts, loading transcripts by variant key where possible.

    Variants without a ``key`` are described by ``_get_no_key_tags``. Variants
    with a key are grouped by ``dataset_type`` so that each dataset type needs
    a single transcript lookup: ``get_transcripts_by_key`` for
    ``Sample.DATASET_TYPE_VARIANT_CALLS`` and ``get_annotations_queryset`` for
    every other dataset type.

    Args:
        discovery_variants: queryset of saved variants to describe.
        genome_version: genome version passed through to the transcript lookups.

    Returns:
        list of dicts with the keys ``family_guid``, ``selectedMainTranscriptId``,
        ``transcripts`` and ``mainTranscriptId``. ``mainTranscriptId`` is the
        ``transcriptId`` of the first transcript whose ``transcriptRank`` is 0,
        or ``None`` when there is no such transcript.
    """
    discovery_tags = list(_get_no_key_tags(discovery_variants.filter(key__isnull=True)))

    # One aggregated row per dataset type: the distinct keys to look up, plus
    # the per-variant tag payloads that the looked-up transcripts are merged into.
    tags_by_dataset_type = discovery_variants.filter(key__isnull=False).values('dataset_type').annotate(
        keys=ArrayAgg('key', distinct=True),
        tags=ArrayAgg(JSONObject(
            key='key', family_guid='family__guid', selectedMainTranscriptId='selected_main_transcript_id',
        )),
    )

    for dataset_type, keys, tags in tags_by_dataset_type.values_list('dataset_type', 'keys', 'tags'):
        if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS:
            transcripts_by_key = get_transcripts_by_key(genome_version, keys)
        else:
            qs = get_annotations_queryset(genome_version, dataset_type, keys)
            transcripts_by_key = dict(qs.values_list('key', qs.transcript_field))
        for tag in tags:
            key = tag.pop('key')
            # Fall back to an empty mapping when the lookup has no (or a null)
            # entry for this key. An unguarded index would raise here, and the
            # caller's broad except would then discard every discovery tag in
            # the request instead of only this variant's transcripts.
            transcripts = transcripts_by_key[key] if key in transcripts_by_key else None
            tag['transcripts'] = transcripts or {}
            tag['mainTranscriptId'] = next((
                t['transcriptId'] for gene_transcripts in tag['transcripts'].values() for t in gene_transcripts
                if t.get('transcriptRank') == 0
            ), None)
            discovery_tags.append(tag)

    return discovery_tags
154178

155179

156180
MME_TAG_NAME = 'MME Submission'

0 commit comments

Comments
 (0)