Skip to content

Commit ac2156a

Browse files
authored
Merge pull request #5042 from broadinstitute/exclude-current-search-results
Exclude current search results from next search
2 parents e3e7df1 + 2337851 commit ac2156a

File tree

12 files changed

+180
-34
lines changed

12 files changed

+180
-34
lines changed

clickhouse_search/backend/functions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
class Array(Func):
88
function = 'array'
99

10+
def _resolve_output_field(self):
11+
output_field = super()._resolve_output_field()
12+
if not isinstance(output_field, ArrayField):
13+
output_field = ArrayField(base_field=output_field)
14+
return output_field
15+
1016

1117
class ArrayConcat(Func):
1218
function = 'arrayConcat'

clickhouse_search/managers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -726,9 +726,12 @@ def clinvar_model(self):
726726
def genome_version(self):
727727
return self.annotations_model.ANNOTATION_CONSTANTS['genomeVersion']
728728

729-
def search(self, sample_data, freqs=None, annotations=None, **kwargs):
729+
def search(self, sample_data, freqs=None, annotations=None, exclude_keys=None, **kwargs):
730730
entries = self.filter_locus(**kwargs)
731731

732+
if exclude_keys:
733+
entries = entries.exclude(key__in=exclude_keys)
734+
732735
entries = self._join_annotations(entries)
733736

734737
is_sv_class = 'cn' in self.call_fields

clickhouse_search/search.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def get_clickhouse_variants(samples, search, user, previous_search_results, geno
3434
family_guid = None
3535
inheritance_mode = search.get('inheritance_mode')
3636
has_comp_het = inheritance_mode in {RECESSIVE, COMPOUND_HET}
37+
exclude_keys = search.pop('exclude_keys', None) or {}
38+
exclude_key_pairs = search.pop('exclude_key_pairs', None) or {}
3739
for dataset_type, sample_data in sample_data_by_dataset_type.items():
3840
logger.info(f'Loading {dataset_type} data for {len(set(sample_data["family_guids"]))} families', user)
3941

@@ -44,13 +46,13 @@ def get_clickhouse_variants(samples, search, user, previous_search_results, geno
4446

4547
dataset_results = []
4648
if inheritance_mode != COMPOUND_HET:
47-
result_q = _get_search_results_queryset(entry_cls, annotations_cls, sample_data, skip_individual_guid=is_multi_project, **search)
49+
result_q = _get_search_results_queryset(entry_cls, annotations_cls, sample_data, skip_individual_guid=is_multi_project, exclude_keys=exclude_keys.get(dataset_type), **search)
4850
dataset_results += _evaluate_results(result_q)
4951
if has_comp_het:
5052
comp_het_sample_data = sample_data
5153
if is_multi_project and dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS and _is_x_chrom_only(genome_version, **search):
5254
comp_het_sample_data = _no_affected_male_families(sample_data, user)
53-
result_q = _get_data_type_comp_het_results_queryset(entry_cls, annotations_cls, comp_het_sample_data, skip_individual_guid=is_multi_project, **search)
55+
result_q = _get_data_type_comp_het_results_queryset(entry_cls, annotations_cls, comp_het_sample_data, exclude_key_pairs=exclude_key_pairs.get(dataset_type), skip_individual_guid=is_multi_project, **search)
5456
dataset_results += _evaluate_results(result_q, is_comp_het=True)
5557

5658
if is_multi_project:
@@ -60,7 +62,7 @@ def get_clickhouse_variants(samples, search, user, previous_search_results, geno
6062
if has_comp_het and Sample.DATASET_TYPE_VARIANT_CALLS in sample_data_by_dataset_type and any(
6163
dataset_type.startswith(Sample.DATASET_TYPE_SV_CALLS) for dataset_type in sample_data_by_dataset_type
6264
):
63-
results += _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_by_dataset_type, user, **search)
65+
results += _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_by_dataset_type, user, exclude_key_pairs, **search)
6466

6567
cache_results = get_clickhouse_cache_results(results, sort, family_guid)
6668
previous_search_results.update(cache_results)
@@ -83,7 +85,7 @@ def _evaluate_results(result_q, is_comp_het=False):
8385
raise InvalidSearchException('This search returned too many results')
8486
return results
8587

86-
def _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_by_dataset_type, user, annotations=None, annotations_secondary=None, inheritance_mode=None, **search_kwargs):
88+
def _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_by_dataset_type, user, exclude_key_pairs, annotations=None, annotations_secondary=None, inheritance_mode=None, **search_kwargs):
8789
if annotations_secondary:
8890
annotations = {
8991
**annotations,
@@ -124,7 +126,7 @@ def _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_b
124126
sv_annotations_cls = ANNOTATIONS_CLASS_MAP[genome_version][sv_dataset_type]
125127
sv_q = sv_annotations_cls.objects.subquery_join(sv_entries).search(**search_kwargs, annotations=annotations)
126128

127-
result_q = _get_comp_het_results_queryset(annotations_cls, snv_indel_q, sv_q, len(families))
129+
result_q = _get_comp_het_results_queryset(annotations_cls, snv_indel_q, sv_q, len(families), exclude_key_pairs.get(f'{Sample.DATASET_TYPE_VARIANT_CALLS},{sv_dataset_type}'))
128130
dataset_results = _evaluate_results(result_q, is_comp_het=True)
129131
if skip_individual_guid:
130132
_add_individual_guids(dataset_results, sv_sample_data)
@@ -133,7 +135,7 @@ def _get_multi_data_type_comp_het_results_queryset(genome_version, sample_data_b
133135
return results
134136

135137

136-
def _get_data_type_comp_het_results_queryset(entry_cls, annotations_cls, sample_data, annotations=None, annotations_secondary=None, inheritance_mode=None, **search_kwargs):
138+
def _get_data_type_comp_het_results_queryset(entry_cls, annotations_cls, sample_data, annotations=None, annotations_secondary=None, inheritance_mode=None, exclude_key_pairs=None, **search_kwargs):
137139
entries = entry_cls.objects.search(
138140
sample_data, **search_kwargs, inheritance_mode=COMPOUND_HET, annotations=annotations, annotate_carriers=True,
139141
)
@@ -143,10 +145,10 @@ def _get_data_type_comp_het_results_queryset(entry_cls, annotations_cls, sample_
143145
annotations=annotations_secondary or annotations, **search_kwargs,
144146
)
145147

146-
return _get_comp_het_results_queryset(annotations_cls, primary_q, secondary_q, len(set(sample_data['family_guids'])))
148+
return _get_comp_het_results_queryset(annotations_cls, primary_q, secondary_q, len(set(sample_data['family_guids'])), exclude_key_pairs)
147149

148150

149-
def _get_comp_het_results_queryset(annotations_cls, primary_q, secondary_q, num_families):
151+
def _get_comp_het_results_queryset(annotations_cls, primary_q, secondary_q, num_families, exclude_key_pairs):
150152
results = annotations_cls.objects.search_compound_hets(primary_q, secondary_q)
151153

152154
if results.has_annotation('primary_carriers') and results.has_annotation('secondary_carriers'):
@@ -206,9 +208,12 @@ def _get_comp_het_results_queryset(annotations_cls, primary_q, secondary_q, num_
206208
**genotype_expressions,
207209
)
208210

209-
return results.annotate(
211+
pair_results = results.annotate(
210212
pair_key=ArraySort(Array('primary_key', 'secondary_key')),
211-
).distinct('pair_key').values_list(
213+
).distinct('pair_key')
214+
if exclude_key_pairs:
215+
pair_results = pair_results.exclude(pair_key__in=exclude_key_pairs)
216+
return pair_results.values_list(
212217
'pair_key',
213218
_result_as_tuple(results, 'primary_'),
214219
_result_as_tuple(results, 'secondary_'),

clickhouse_search/search_tests.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,34 @@ def test_inheritance_filter(self):
503503
[GRCH37_VARIANT], inheritance_mode=inheritance_mode, inheritance_filter={'allowNoCall': True},
504504
)
505505

506+
def test_exclude_previous_search_results(self):
507+
VariantSearchResults.objects.create(variant_search_id=79516, search_hash='abc1234')
508+
self.mock_redis.get.side_effect = [None, json.dumps({'all_results': [
509+
VARIANT1, VARIANT2, [VARIANT3, VARIANT2], [GCNV_VARIANT4, GCNV_VARIANT3],
510+
]})]
511+
self.mock_redis.keys.side_effect = [[], ['search_results__abc1234__gnomad']]
512+
513+
self._assert_expected_search(
514+
[[MULTI_DATA_TYPE_COMP_HET_VARIANT2, GCNV_VARIANT4], [VARIANT3, VARIANT4], GCNV_VARIANT3, MITO_VARIANT3],
515+
inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS,
516+
exclude={'previousSearch': True, 'previousSearchHash': 'abc1234'}, cached_variant_fields=[
517+
[{'selectedGeneId': 'ENSG00000277258'}, {'selectedGeneId': 'ENSG00000277258'}],
518+
[{'selectedGeneId': 'ENSG00000097046'}, {'selectedGeneId': 'ENSG00000097046'}],
519+
{}, {},
520+
],
521+
)
522+
523+
self.mock_redis.get.side_effect = [None, json.dumps({'all_results': [
524+
[MULTI_DATA_TYPE_COMP_HET_VARIANT2, GCNV_VARIANT4], [VARIANT3, VARIANT4], GCNV_VARIANT3, MITO_VARIANT3,
525+
]})]
526+
self.mock_redis.keys.side_effect = [[], ['search_results__abc1234__gnomad']]
527+
self._assert_expected_search(
528+
[VARIANT2, [GCNV_VARIANT3, GCNV_VARIANT4]],
529+
inheritance_mode='recessive', cached_variant_fields=[
530+
{}, [{'selectedGeneId': 'ENSG00000275023'}, {'selectedGeneId': 'ENSG00000275023'}],
531+
],
532+
)
533+
506534
def test_quality_filter(self):
507535
quality_filter = {'vcf_filter': 'pass'}
508536
self._assert_expected_search(

seqr/utils/search/search_utils_tests.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,9 @@ def test_invalid_search_query_variants(self):
279279

280280
self._test_invalid_search_params(query_variants)
281281

282-
def _test_expected_search_call(self, mock_get_variants, results_cache, search_fields=None, has_gene_search=False,
282+
def _get_expected_search_call(self, results_cache, search_fields=None, has_gene_search=False,
283283
rs_ids=None, variant_ids=None, parsed_variant_ids=None, inheritance_mode='de_novo',
284-
dataset_type=None, secondary_dataset_type=None, omitted_sample_guids=None,
284+
dataset_type=None, secondary_dataset_type=None, exclude_keys=None, exclude_key_pairs=None,
285285
exclude_locations=False, exclude=None, annotations=None, annotations_secondary=None, single_gene_search=False, **kwargs):
286286
genes = intervals = None
287287
has_included_gene_search = has_gene_search and not exclude_locations
@@ -315,8 +315,16 @@ def _test_expected_search_call(self, mock_get_variants, results_cache, search_fi
315315
expected_search['annotations'] = annotations
316316
if annotations_secondary is not None:
317317
expected_search['annotations_secondary'] = annotations_secondary
318+
if exclude_keys is not None:
319+
expected_search['exclude_keys'] = exclude_keys
320+
if exclude_key_pairs is not None:
321+
expected_search['exclude_key_pairs'] = exclude_key_pairs
318322

319-
mock_get_variants.assert_called_with(mock.ANY, expected_search, self.user, results_cache, '37', **kwargs)
323+
return (mock.ANY, expected_search, self.user, results_cache, '37'), kwargs, genes
324+
325+
def _test_expected_search_call(self, mock_get_variants, *args, has_gene_search=False, exclude_locations=False, omitted_sample_guids=None, **kwargs):
326+
call_args, call_kwargs, genes = self._get_expected_search_call(*args, has_gene_search=has_gene_search, exclude_locations=exclude_locations, **kwargs)
327+
mock_get_variants.assert_called_with(*call_args, **call_kwargs)
320328
self._assert_expected_search_samples(mock_get_variants, omitted_sample_guids, has_gene_search and not exclude_locations)
321329

322330
if genes:
@@ -341,16 +349,17 @@ def _assert_expected_search_samples(self, mock_get_variants, omitted_sample_guid
341349

342350

343351
def test_query_variants(self, mock_get_variants):
352+
parsed_variants = [{**v, 'key': (i+1) * 1000 } for i, v in enumerate(PARSED_VARIANTS)]
344353
def _mock_get_variants(families, search, user, previous_search_results, genome_version, **kwargs):
345-
previous_search_results['all_results'] = PARSED_VARIANTS
354+
previous_search_results['all_results'] =parsed_variants
346355
previous_search_results['total_results'] = 5
347-
return PARSED_VARIANTS
356+
return parsed_variants
348357
mock_get_variants.side_effect = _mock_get_variants
349358

350359
variants, total = query_variants(self.results_model, user=self.user)
351-
self.assertListEqual(variants, PARSED_VARIANTS)
360+
self.assertListEqual(variants, parsed_variants)
352361
self.assertEqual(total, 5)
353-
results_cache = {'all_results': PARSED_VARIANTS, 'total_results': 5}
362+
results_cache = {'all_results': parsed_variants, 'total_results': 5}
354363
self.assert_cached_results(results_cache)
355364
self._test_expected_search_call(
356365
mock_get_variants, results_cache, sort='xpos', page=1, num_results=100, skip_genotype_filter=False,
@@ -466,6 +475,25 @@ def _mock_get_variants(families, search, user, previous_search_results, genome_v
466475
omitted_sample_guids=['S000145_hg00731', 'S000146_hg00732', 'S000148_hg00733'],
467476
)
468477

478+
self.set_cache(None)
479+
mock_get_variants.reset_mock()
480+
self.search_model.search = {
481+
'inheritance': {'mode': 'any_affected'},
482+
'exclude': {'previousSearch': True, 'previousSearchHash': 'abc1234', 'clinvar': ['benign']},
483+
}
484+
previous_search_model = VariantSearch.objects.create(search={'inheritance': {'mode': 'de_novo'}})
485+
previous_results_model = VariantSearchResults.objects.create(variant_search=previous_search_model, search_hash='abc1234')
486+
previous_results_model.families.set(self.families)
487+
query_variants(self.results_model, user=self.user)
488+
self._test_exclude_previous_search(
489+
mock_get_variants, results_cache, sort='xpos', page=1, num_results=100, skip_genotype_filter=False,
490+
inheritance_mode='any_affected', exclude={'clinvar': ['benign']}, search_fields=['exclude'],
491+
)
492+
493+
def _test_exclude_previous_search(self, mock_get_variants, *args, num_searches=1, **kwargs):
494+
self._test_expected_search_call(mock_get_variants, *args, **kwargs)
495+
self.assertEqual(mock_get_variants.call_count, num_searches)
496+
469497
def test_cached_query_variants(self):
470498
self.set_cache({'total_results': 4, 'all_results': self.CACHED_VARIANTS})
471499
variants, total = query_variants(self.results_model, user=self.user)
@@ -624,6 +652,28 @@ def _assert_expected_get_single_variant_call(self, mock_call, variant_id, expect
624652
def test_query_variants(self, mock_call):
625653
super().test_query_variants(mock_call)
626654

655+
def _test_exclude_previous_search(self, mock_get_variants, *args, **kwargs):
656+
super()._test_exclude_previous_search(
657+
mock_get_variants, *args, **kwargs,
658+
exclude_key_pairs={}, exclude_keys={'MITO': [1000, 2000]}, num_searches=2,
659+
)
660+
call_args, call_kwargs, _ = self._get_expected_search_call(*args, inheritance_mode='de_novo', sort=None, num_results=100)
661+
mock_get_variants.assert_has_calls([mock.call(*call_args, **call_kwargs)])
662+
663+
# Test when previous results are cached
664+
mock_get_variants.reset_mock()
665+
self.mock_redis.get.side_effect = [
666+
None,
667+
json.dumps({'all_results': [VARIANT1, [VARIANT1, VARIANT2], [VARIANT1, SV_VARIANT1], VARIANT2, [VARIANT4, VARIANT3], SV_VARIANT1]}),
668+
]
669+
self.mock_redis.keys.side_effect = [[], ['search_results__abc1234__gnomad']]
670+
query_variants(self.results_model, user=self.user)
671+
super()._test_exclude_previous_search(
672+
mock_get_variants, *args, **kwargs, num_searches=1,
673+
exclude_key_pairs={'SNV_INDEL': [[1, 2], [3, 4]], 'SNV_INDEL,SV_WGS': [[1, 12]]},
674+
exclude_keys={'SNV_INDEL': [1, 2], 'SV_WGS': [12]},
675+
)
676+
627677
def test_cached_query_variants(self):
628678
Project.objects.filter(id=1).update(genome_version='38')
629679
super().test_cached_query_variants()

seqr/utils/search/utils.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from clickhouse_search.search import get_clickhouse_variants, format_clickhouse_results, \
88
get_clickhouse_cache_results, clickhouse_variant_lookup, get_clickhouse_variant_by_id
99
from reference_data.models import GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37
10-
from seqr.models import Sample, Individual, Project
10+
from seqr.models import Sample, Individual, Project, VariantSearchResults
1111
from seqr.utils.logging_utils import SeqrLogger
1212
from seqr.utils.redis_utils import safe_redis_get_json, safe_redis_get_wildcard_json, safe_redis_set_json
1313
from seqr.utils.search.constants import XPOS_SORT_KEY, PRIORITIZED_GENE_SORT, RECESSIVE, COMPOUND_HET, \
@@ -293,6 +293,13 @@ def _query_variants(search_model, user, previous_search_results, genome_version,
293293
if duplicates:
294294
raise InvalidSearchException(f'ClinVar pathogenicity {", ".join(sorted(duplicates))} is both included and excluded')
295295

296+
exclude.pop('previousSearch', None)
297+
exclude_previous_hash = exclude.pop('previousSearchHash', None)
298+
if exclude_previous_hash:
299+
parsed_search.update(backend_specific_call(
300+
lambda *args: {}, _get_clickhouse_exclude_keys,
301+
)(exclude_previous_hash, user, genome_version))
302+
296303
for annotation_key in ['annotations', 'annotations_secondary']:
297304
if parsed_search.get(annotation_key):
298305
parsed_search[annotation_key] = {k: v for k, v in parsed_search[annotation_key].items() if v}
@@ -323,6 +330,34 @@ def _query_variants(search_model, user, previous_search_results, genome_version,
323330
return variant_results, previous_search_results.get('total_results')
324331

325332

333+
def _get_clickhouse_exclude_keys(search_hash, user, genome_version):
334+
previous_search_model = VariantSearchResults.objects.get(search_hash=search_hash)
335+
cached_results = _get_any_sort_cached_results(previous_search_model)
336+
if not cached_results:
337+
cached_results = {}
338+
_query_variants(previous_search_model, user, cached_results, genome_version)
339+
results = cached_results['all_results']
340+
exclude_keys = defaultdict(list)
341+
exclude_key_pairs = defaultdict(list)
342+
for variant in results:
343+
if isinstance(variant, list):
344+
dt1= variant_dataset_type(variant[0])
345+
dt2 = variant_dataset_type(variant[1])
346+
dataset_type = dt1 if dt1 == dt2 else ','.join(sorted([dt1, dt2]))
347+
exclude_key_pairs[dataset_type].append(sorted([variant[0]['key'], variant[1]['key']]))
348+
else:
349+
dataset_type = variant_dataset_type(variant)
350+
exclude_keys[dataset_type].append(variant['key'])
351+
return {'exclude_keys': dict(exclude_keys), 'exclude_key_pairs': dict(exclude_key_pairs)}
352+
353+
354+
def variant_dataset_type(variant):
355+
if not parse_variant_id(variant['variantId']):
356+
sample_type = Sample.SAMPLE_TYPE_WGS if 'endChrom' in variant else Sample.SAMPLE_TYPE_WES
357+
return f'{Sample.DATASET_TYPE_SV_CALLS}_{sample_type}'
358+
return Sample.DATASET_TYPE_MITO_CALLS if 'mitomapPathogenic' in variant else Sample.DATASET_TYPE_VARIANT_CALLS
359+
360+
326361
def get_variant_query_gene_counts(search_model, user):
327362
return backend_specific_call(
328363
_get_es_variant_query_gene_counts,

seqr/views/apis/variant_search_api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ def _get_or_create_results_model(search_hash, search_context, user):
113113
).distinct()
114114

115115
search_dict = search_context.get('search', {})
116+
if search_context.get('previousSearchHash') and (search_dict.get('exclude') or {}).get('previousSearch'):
117+
search_dict['exclude']['previousSearchHash'] = search_context['previousSearchHash']
116118
search_model = VariantSearch.objects.filter(search=search_dict).filter(
117119
Q(created_by=user) | Q(name__isnull=False)).first()
118120
if not search_model:

0 commit comments

Comments
 (0)