Skip to content

Commit d829f26

Browse files
authored
Merge pull request #651 from broadinstitute/jg/change_denovo_to_make_full_trio_mt
Make dense trio MT
2 parents af3f365 + 350d345 commit d829f26

File tree

3 files changed

+129
-2
lines changed

3 files changed

+129
-2
lines changed

gnomad_qc/v4/resources/basics.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ def get_gnomad_v4_vds(
5252
entries_to_keep: Optional[List[str]] = None,
5353
annotate_het_non_ref: bool = False,
5454
checkpoint_variant_data: bool = False,
55+
naive_coalesce_partitions: Optional[int] = None,
5556
) -> hl.vds.VariantDataset:
5657
"""
5758
Get gnomAD v4 data with desired filtering and metadata annotations.
@@ -96,6 +97,8 @@ def get_gnomad_v4_vds(
9697
'_het_non_ref') to the variant data. Default is False.
9798
:param checkpoint_variant_data: Whether to checkpoint the variant data MT after
9899
splitting and filtering. Default is False.
100+
:param naive_coalesce_partitions: Optional argument to coalesce the VDS to a
101+
specific number of partitions using naive coalesce.
99102
:return: gnomAD v4 dataset with chosen annotations and filters.
100103
"""
101104
if remove_hard_filtered_samples and remove_hard_filtered_samples_no_sex:
@@ -158,6 +161,12 @@ def get_gnomad_v4_vds(
158161
logger.info("Filtering to chromosome %s...", chrom)
159162
vds = hl.vds.filter_chromosomes(vds, keep=chrom)
160163

164+
if naive_coalesce_partitions:
165+
vds = hl.vds.VariantDataset(
166+
vds.reference_data.naive_coalesce(naive_coalesce_partitions),
167+
vds.variant_data.naive_coalesce(naive_coalesce_partitions),
168+
)
169+
161170
if filter_partitions:
162171
logger.info("Filtering to %s partitions...", len(filter_partitions))
163172
vds = hl.vds.VariantDataset(

gnomad_qc/v4/resources/sample_qc.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -997,6 +997,31 @@ def ped_filter_param_json_path(
997997
return f"{get_sample_qc_root(version)}/relatedness/trios/gnomad.exomes.v{version}.ped_filters.json"
998998

999999

1000+
def dense_trio_mt(
    releasable: bool = True,
    test: bool = False,
) -> VersionedMatrixTableResource:
    """
    Build the versioned resource pointing at the dense trio MatrixTable.

    One MatrixTableResource is registered for every version listed in
    `SAMPLE_QC_VERSIONS`; `CURRENT_SAMPLE_QC_VERSION` is passed as the first
    argument of the returned VersionedMatrixTableResource.

    :param releasable: Whether to get the resource for the releasable trios only.
        When True, '.releasable' is inserted into the MatrixTable file name.
    :param test: Whether to use a tmp path for a test resource.
    :return: VersionedMatrixTableResource of dense trio MatrixTable.
    """
    data_type = "exomes"
    # Only the releasable-trios resource carries the extra file name component.
    releasable_suffix = ".releasable" if releasable else ""

    resources_by_version = {}
    for version in SAMPLE_QC_VERSIONS:
        root = get_sample_qc_root(version, test, data_type=data_type)
        resources_by_version[version] = MatrixTableResource(
            f"{root}/relatedness/trios/gnomad.{data_type}.v{version}.trios"
            f"{releasable_suffix}.dense.mt"
        )

    return VersionedMatrixTableResource(
        CURRENT_SAMPLE_QC_VERSION,
        resources_by_version,
    )
1023+
1024+
10001025
######################################################################
10011026
# Other resources
10021027
######################################################################

gnomad_qc/v4/sample_qc/identify_trios.py

Lines changed: 95 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import hail as hl
1111
from gnomad.sample_qc.relatedness import (
1212
create_fake_pedigree,
13+
filter_to_trios,
1314
get_duplicated_samples,
1415
get_duplicated_samples_ht,
1516
infer_families,
@@ -24,8 +25,9 @@
2425
)
2526
from gnomad_qc.slack_creds import slack_token
2627
from gnomad_qc.v4.resources.basics import get_gnomad_v4_vds
27-
from gnomad_qc.v4.resources.meta import project_meta
28+
from gnomad_qc.v4.resources.meta import meta, project_meta
2829
from gnomad_qc.v4.resources.sample_qc import (
30+
dense_trio_mt,
2931
duplicates,
3032
finalized_outlier_filtering,
3133
interval_qc_pass,
@@ -385,12 +387,64 @@ def filter_ped(
385387
return hl.Pedigree([trio for trio in ped.trios if trio.s in trios]), cutoffs
386388

387389

388-
def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollection:
390+
def create_dense_trio_mt(
    fam_ht: hl.Table,
    meta_ht: hl.Table,
    releasable_only: bool = True,
    test: bool = False,
    naive_coalesce_partitions: Optional[int] = None,
) -> hl.MatrixTable:
    """
    Densify the gnomAD v4 VDS for samples that belong to high quality trios.

    The metadata Table is first restricted to high quality (and optionally
    releasable) samples, the family Table is restricted to rows whose `id`,
    `pat_id` and `mat_id` are all present in that restricted metadata, and the
    resulting trio samples are used to filter the VDS before densifying.

    :param fam_ht: Table with family information.
    :param meta_ht: Table with metadata information.
    :param releasable_only: Whether to only include trios that are releasable.
        Default is True.
    :param test: Whether to filter to chr20 for testing. Default is False.
    :param naive_coalesce_partitions: Optional number of partitions to coalesce
        the VDS to. Default is None.
    :return: Dense MatrixTable with high quality trios.
    """
    # Per-sample inclusion criterion: high quality, plus releasable on request.
    if releasable_only:
        keep_sample_expr = meta_ht.high_quality & meta_ht.project_meta.releasable
    else:
        keep_sample_expr = meta_ht.high_quality
    passing_meta_ht = meta_ht.filter(keep_sample_expr)

    # A family row is kept only when all three of its members passed the sample
    # filter above, i.e. each ID still has a row in the filtered metadata.
    id_kept = hl.is_defined(passing_meta_ht[fam_ht.id])
    pat_kept = hl.is_defined(passing_meta_ht[fam_ht.pat_id])
    mat_kept = hl.is_defined(passing_meta_ht[fam_ht.mat_id])
    complete_fam_ht = fam_ht.filter(id_kept & pat_kept & mat_kept)

    # Restrict the metadata to the samples that belong to the remaining trios.
    trio_samples_ht = filter_to_trios(passing_meta_ht, complete_fam_ht)

    # Keep every entry field except `gvcf_info`: it is likely not needed, and
    # dropping it reduces the size of the dense MatrixTable.
    entry_fields = ["LA", "LGT", "LAD", "LPGT", "LPL", "DP", "GQ", "SB"]
    test_chrom = "chr20" if test else None

    trio_vds = get_gnomad_v4_vds(
        high_quality_only=True,
        chrom=test_chrom,
        filter_samples_ht=trio_samples_ht,
        entries_to_keep=entry_fields,
        naive_coalesce_partitions=naive_coalesce_partitions,
    )

    return hl.vds.to_dense_mt(trio_vds)
434+
435+
436+
def get_trio_resources(
437+
overwrite: bool,
438+
test: bool,
439+
dense_trio_mt_releasable_only: bool = True,
440+
) -> PipelineResourceCollection:
389441
"""
390442
Get PipelineResourceCollection for all resources needed in the trio identification pipeline.
391443
392444
:param overwrite: Whether to overwrite existing resources.
393445
:param test: Whether to use test resources.
446+
:param dense_trio_mt_releasable_only: Whether to only include trios that are
447+
releasable in the dense trio MT. Default is True.
394448
:return: PipelineResourceCollection containing resources for all steps of the
395449
trio identification pipeline.
396450
"""
@@ -457,6 +511,16 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
457511
},
458512
pipeline_input_steps=[infer_families, run_mendel_errors],
459513
)
514+
create_dense_trio_mt = PipelineStepResourceCollection(
515+
"--create-dense-trio-mt",
516+
output_resources={
517+
"dense_trio_mt": dense_trio_mt(
518+
releasable=dense_trio_mt_releasable_only, test=test
519+
)
520+
},
521+
pipeline_input_steps=[finalize_ped],
522+
add_input_resources={"Finalized metadata HT": {"meta_ht": meta()}},
523+
)
460524

461525
# Add all steps to the trio identification pipeline resource collection.
462526
trio_pipeline.add_steps(
@@ -466,6 +530,7 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
466530
"create_fake_pedigree": create_fake_pedigree,
467531
"run_mendel_errors": run_mendel_errors,
468532
"finalize_ped": finalize_ped,
533+
"create_dense_trio_mt": create_dense_trio_mt,
469534
}
470535
)
471536

@@ -566,6 +631,17 @@ def main(args):
566631
with hl.hadoop_open(res.filter_json, "w") as d:
567632
d.write(json.dumps(filters))
568633

634+
if args.create_dense_trio_mt:
635+
res = trio_resources.create_dense_trio_mt
636+
res.check_resource_existence()
637+
create_dense_trio_mt(
638+
res.final_ped.ht(),
639+
res.meta_ht.ht(),
640+
args.releasable_only,
641+
test,
642+
naive_coalesce_partitions=args.naive_coalesce_partitions,
643+
).write(res.dense_trio_mt.path, overwrite=args.overwrite)
644+
569645

570646
def get_script_argument_parser() -> argparse.ArgumentParser:
571647
"""Get script argument parser."""
@@ -724,6 +800,23 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
724800
type=int,
725801
default=24,
726802
)
803+
dense_trio_mt_args = parser.add_argument_group("Create dense trio MT.")
804+
dense_trio_mt_args.add_argument(
805+
"--create-dense-trio-mt",
806+
help=("Create a dense MT for high quality trios."),
807+
action="store_true",
808+
)
809+
dense_trio_mt_args.add_argument(
810+
"--releasable-only",
811+
help=("Only include trios that are releasable."),
812+
action="store_true",
813+
)
814+
dense_trio_mt_args.add_argument(
815+
"--naive-coalesce-partitions",
816+
help=("Number of partitions to coalesce the VDS to."),
817+
type=int,
818+
default=5000,
819+
)
727820

728821
return parser
729822

0 commit comments

Comments
 (0)