|
1 | 1 | from collections import defaultdict
|
2 | 2 | from django.db.models import Count, Q, F, prefetch_related_objects
|
| 3 | +from django.contrib.postgres.aggregates import ArrayAgg |
| 4 | +from django.db.models.functions import JSONObject |
3 | 5 |
|
4 |
| -from clickhouse_search.search import get_transcripts_by_key |
| 6 | +from clickhouse_search.search import get_transcripts_by_key, get_annotations_queryset |
5 | 7 | from seqr.models import Individual, IgvSample, AnalysisGroup, DynamicAnalysisGroup, LocusList, VariantTagType,\
|
6 |
| - VariantFunctionalData, FamilyNote, SavedVariant, VariantTag, VariantNote |
| 8 | + VariantFunctionalData, FamilyNote, SavedVariant, VariantTag, VariantNote, Sample |
7 | 9 | from seqr.utils.gene_utils import get_genes
|
8 | 10 | from seqr.utils.logging_utils import SeqrLogger
|
9 | 11 | from seqr.utils.search.utils import backend_specific_call
|
@@ -119,38 +121,60 @@ def families_discovery_tags(families, genome_version, project=None):
|
119 | 121 | families_by_guid = {f['familyGuid']: dict(discoveryTags=[], **f) for f in families}
|
120 | 122 |
|
121 | 123 | family_filter = {'family__project': project} if project else {'family__guid__in': families_by_guid.keys()}
|
122 |
| - discovery_tags_by_key = {v.pop('key'): v for v in SavedVariant.objects.filter( |
| 124 | + discovery_variants = SavedVariant.objects.filter( |
123 | 125 | varianttag__variant_tag_type__category='CMG Discovery Tags', **family_filter,
|
124 |
| - ).values( |
125 |
| - 'key', 'family__guid', selectedMainTranscriptId=F('selected_main_transcript_id'), |
126 |
| - transcripts=F('saved_variant_json__transcripts'), mainTranscriptId=F('saved_variant_json__mainTranscriptId'), |
127 |
| - )} |
| 126 | + ) |
128 | 127 | try:
|
129 |
| - backend_specific_call(lambda *args: None, _add_clickhouse_transcripts)(discovery_tags_by_key, genome_version) |
| 128 | + discovery_tags = backend_specific_call(_get_no_key_tags, _get_clickhouse_tags)( |
| 129 | + discovery_variants, genome_version=genome_version, |
| 130 | + ) |
130 | 131 | except Exception as e:
|
131 | 132 | logger.error(f'Error loading discovery genes from clickhouse: {e}', None)
|
| 133 | + discovery_tags = [] |
132 | 134 |
|
133 | 135 | gene_ids = set()
|
134 |
| - for tag in discovery_tags_by_key.values(): |
| 136 | + for tag in discovery_tags: |
135 | 137 | tag['transcripts'] = tag.get('transcripts') or {}
|
136 | 138 | gene_ids.update(list(tag['transcripts'].keys()))
|
137 |
| - families_by_guid[tag.pop('family__guid')]['discoveryTags'].append(tag) |
| 139 | + families_by_guid[tag.pop('family_guid')]['discoveryTags'].append(tag) |
138 | 140 |
|
139 | 141 | return {
|
140 | 142 | 'familiesByGuid': families_by_guid,
|
141 | 143 | 'genesById': get_genes(gene_ids),
|
142 | 144 | }
|
143 | 145 |
|
144 | 146 |
|
145 |
| -def _add_clickhouse_transcripts(discovery_tags_by_key, genome_version): |
146 |
| - transcripts_by_key = get_transcripts_by_key(genome_version, discovery_tags_by_key.keys()) |
147 |
| - for key, tag in discovery_tags_by_key.items(): |
148 |
| - if key in transcripts_by_key: |
def _get_no_key_tags(discovery_variants, **kwargs):
    """Return the discovery tag fields read directly from the saved variant rows.

    Extra keyword arguments (e.g. ``genome_version``) are accepted and ignored so this
    function can be called with the same arguments as ``_get_clickhouse_tags``.
    """
    # Output key -> model field expression; key names are what the caller reads.
    tag_fields = {
        'family_guid': F('family__guid'),
        'selectedMainTranscriptId': F('selected_main_transcript_id'),
        'transcripts': F('saved_variant_json__transcripts'),
        'mainTranscriptId': F('saved_variant_json__mainTranscriptId'),
    }
    return discovery_variants.values(**tag_fields)
| 152 | + |
| 153 | + |
def _get_clickhouse_tags(discovery_variants, genome_version):
    """Build discovery tag dicts, loading transcripts from clickhouse for keyed variants.

    Variants without a ``key`` are returned as produced by ``_get_no_key_tags``. Variants
    with a ``key`` are grouped by ``dataset_type`` so each group needs only one transcript
    lookup.

    Args:
        discovery_variants: queryset of the discovery-tagged saved variants.
        genome_version: genome version forwarded to the clickhouse lookups.

    Returns:
        list of dicts, each with ``family_guid``, ``selectedMainTranscriptId``,
        ``transcripts`` and ``mainTranscriptId``.
    """
    discovery_tags = list(_get_no_key_tags(discovery_variants.filter(key__isnull=True)))

    # One row per dataset type: the distinct keys to look up, plus one JSON object per
    # saved variant so the family and selected transcript stay paired with their key.
    tags_by_dataset_type = discovery_variants.filter(key__isnull=False).values('dataset_type').annotate(
        keys=ArrayAgg('key', distinct=True),
        tags=ArrayAgg(JSONObject(
            key='key', family_guid='family__guid', selectedMainTranscriptId='selected_main_transcript_id',
        )),
    )

    for dataset_type, keys, tags in tags_by_dataset_type.values_list('dataset_type', 'keys', 'tags'):
        if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS:
            transcripts_by_key = get_transcripts_by_key(genome_version, keys)
        else:
            qs = get_annotations_queryset(genome_version, dataset_type, keys)
            transcripts_by_key = dict(qs.values_list('key', qs.transcript_field))
        for tag in tags:
            # A key with no clickhouse row, or a null transcripts value, must not raise:
            # the caller's broad except would then discard the tags of every family.
            # Fall back to empty transcripts so only this tag loses its gene details.
            transcripts = transcripts_by_key.get(tag.pop('key')) or {}
            tag['transcripts'] = transcripts
            # The main transcript is the one ranked 0; None when no transcript carries that rank.
            tag['mainTranscriptId'] = next((
                t['transcriptId'] for gene_transcripts in transcripts.values() for t in gene_transcripts
                if t.get('transcriptRank') == 0
            ), None)
            discovery_tags.append(tag)

    return discovery_tags
154 | 178 |
|
155 | 179 |
|
156 | 180 | MME_TAG_NAME = 'MME Submission'
|
|
0 commit comments