1010import hail as hl
1111from gnomad .sample_qc .relatedness import (
1212 create_fake_pedigree ,
13+ filter_to_trios ,
1314 get_duplicated_samples ,
1415 get_duplicated_samples_ht ,
1516 infer_families ,
2425)
2526from gnomad_qc .slack_creds import slack_token
2627from gnomad_qc .v4 .resources .basics import get_gnomad_v4_vds
27- from gnomad_qc .v4 .resources .meta import project_meta
28+ from gnomad_qc .v4 .resources .meta import meta , project_meta
2829from gnomad_qc .v4 .resources .sample_qc import (
30+ dense_trio_mt ,
2931 duplicates ,
3032 finalized_outlier_filtering ,
3133 interval_qc_pass ,
@@ -385,12 +387,64 @@ def filter_ped(
385387 return hl .Pedigree ([trio for trio in ped .trios if trio .s in trios ]), cutoffs
386388
387389
388- def get_trio_resources (overwrite : bool , test : bool ) -> PipelineResourceCollection :
def create_dense_trio_mt(
    fam_ht: hl.Table,
    meta_ht: hl.Table,
    releasable_only: bool = True,
    test: bool = False,
    naive_coalesce_partitions: Optional[int] = None,
) -> hl.MatrixTable:
    """
    Build a dense MatrixTable restricted to samples in high quality trios.

    A trio row of `fam_ht` is retained only when its `id`, `pat_id` and `mat_id`
    are all present in the metadata Table after the sample-level filter
    (`high_quality`, and additionally `project_meta.releasable` when
    `releasable_only` is set) has been applied.

    :param fam_ht: Table with family information; must have `id`, `pat_id` and
        `mat_id` fields.
    :param meta_ht: Table with sample metadata; must have `high_quality` and
        `project_meta.releasable` fields.
    :param releasable_only: Whether to additionally require samples to be
        releasable. Default is True.
    :param test: Whether to restrict the VDS to chr20 for testing. Default is False.
    :param naive_coalesce_partitions: Optional number of partitions passed to the
        VDS loader for coalescing. Default is None.
    :return: Dense MatrixTable for the samples in the retained trios.
    """
    # Sample-level inclusion criteria: high quality, optionally releasable too.
    keep_sample = meta_ht.high_quality
    if releasable_only:
        keep_sample = keep_sample & meta_ht.project_meta.releasable
    passing_ht = meta_ht.filter(keep_sample)

    # A trio survives only if all three of its members passed the sample filter.
    id_ok = hl.is_defined(passing_ht[fam_ht.id])
    pat_ok = hl.is_defined(passing_ht[fam_ht.pat_id])
    mat_ok = hl.is_defined(passing_ht[fam_ht.mat_id])
    complete_trios_ht = fam_ht.filter(id_ok & pat_ok & mat_ok)

    # Restrict the metadata to the samples belonging to the surviving trios.
    trio_samples_ht = filter_to_trios(passing_ht, complete_trios_ht)

    # Only chr20 is loaded when testing; otherwise no chromosome restriction.
    if test:
        chrom = "chr20"
    else:
        chrom = None

    # `gvcf_info` is deliberately left out of the kept entries: it is likely not
    # needed, and dropping it reduces the size of the dense MatrixTable.
    kept_entries = ["LA", "LGT", "LAD", "LPGT", "LPL", "DP", "GQ", "SB"]

    return hl.vds.to_dense_mt(
        get_gnomad_v4_vds(
            high_quality_only=True,
            chrom=chrom,
            filter_samples_ht=trio_samples_ht,
            entries_to_keep=kept_entries,
            naive_coalesce_partitions=naive_coalesce_partitions,
        )
    )
434+
435+
436+ def get_trio_resources (
437+ overwrite : bool ,
438+ test : bool ,
439+ dense_trio_mt_releasable_only : bool = True ,
440+ ) -> PipelineResourceCollection :
389441 """
390442 Get PipelineResourceCollection for all resources needed in the trio identification pipeline.
391443
392444 :param overwrite: Whether to overwrite existing resources.
393445 :param test: Whether to use test resources.
446+ :param dense_trio_mt_releasable_only: Whether to only include trios that are
447+ releasable in the dense trio MT. Default is True.
394448 :return: PipelineResourceCollection containing resources for all steps of the
395449 trio identification pipeline.
396450 """
@@ -457,6 +511,16 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
457511 },
458512 pipeline_input_steps = [infer_families , run_mendel_errors ],
459513 )
514+ create_dense_trio_mt = PipelineStepResourceCollection (
515+ "--create-dense-trio-mt" ,
516+ output_resources = {
517+ "dense_trio_mt" : dense_trio_mt (
518+ releasable = dense_trio_mt_releasable_only , test = test
519+ )
520+ },
521+ pipeline_input_steps = [finalize_ped ],
522+ add_input_resources = {"Finalized metadata HT" : {"meta_ht" : meta ()}},
523+ )
460524
461525 # Add all steps to the trio identification pipeline resource collection.
462526 trio_pipeline .add_steps (
@@ -466,6 +530,7 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
466530 "create_fake_pedigree" : create_fake_pedigree ,
467531 "run_mendel_errors" : run_mendel_errors ,
468532 "finalize_ped" : finalize_ped ,
533+ "create_dense_trio_mt" : create_dense_trio_mt ,
469534 }
470535 )
471536
@@ -566,6 +631,17 @@ def main(args):
566631 with hl .hadoop_open (res .filter_json , "w" ) as d :
567632 d .write (json .dumps (filters ))
568633
634+ if args .create_dense_trio_mt :
635+ res = trio_resources .create_dense_trio_mt
636+ res .check_resource_existence ()
637+ create_dense_trio_mt (
638+ res .final_ped .ht (),
639+ res .meta_ht .ht (),
640+ args .releasable_only ,
641+ test ,
642+ naive_coalesce_partitions = args .naive_coalesce_partitions ,
643+ ).write (res .dense_trio_mt .path , overwrite = args .overwrite )
644+
569645
570646def get_script_argument_parser () -> argparse .ArgumentParser :
571647 """Get script argument parser."""
@@ -724,6 +800,23 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
724800 type = int ,
725801 default = 24 ,
726802 )
803+ dense_trio_mt_args = parser .add_argument_group ("Create dense trio MT." )
804+ dense_trio_mt_args .add_argument (
805+ "--create-dense-trio-mt" ,
806+ help = ("Create a dense MT for high quality trios." ),
807+ action = "store_true" ,
808+ )
809+ dense_trio_mt_args .add_argument (
810+ "--releasable-only" ,
811+ help = ("Only include trios that are releasable." ),
812+ action = "store_true" ,
813+ )
814+ dense_trio_mt_args .add_argument (
815+ "--naive-coalesce-partitions" ,
816+ help = ("Number of partitions to coalesce the VDS to." ),
817+ type = int ,
818+ default = 5000 ,
819+ )
727820
728821 return parser
729822
0 commit comments