@@ -3,6 +3,7 @@ version 1.0
33import "Structs.wdl"
44import "PlotSVCountsPerSample.wdl" as plot_svcounts
55import "Utils.wdl" as util
6+ import "TasksMakeCohortVcf.wdl" as cohort_utils
67import "CollectQcVcfWide.wdl" as qc_utils
78import "FilterOutlierSamplesPostMinGQ.wdl" as legacy
89
@@ -19,6 +20,7 @@ workflow IdentifyOutlierSamples {
1920 String ? bcftools_preprocessing_options
2021 Boolean plot_counts = false
2122 Array [Pair [String , File ]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
23+ Int samples_per_shard = 5000
2224 String sv_pipeline_docker
2325 String sv_base_mini_docker
2426 String linux_docker
@@ -34,15 +36,14 @@ workflow IdentifyOutlierSamples {
3436
3537 String prefix = if (defined (vcf_identifier )) then "~{name }_~{vcf_identifier }" else name
3638
37- if (!defined (sample_subsets )) {
38- call util .GetSampleIdsFromVcf as GetSamplesList {
39- input :
40- vcf = vcfs [0 ],
41- sv_base_mini_docker = sv_base_mini_docker ,
42- runtime_attr_override = runtime_attr_ids_from_vcf
43- }
39+ call util .GetSampleIdsFromVcf as GetSamplesList {
40+ input :
41+ vcf = vcfs [0 ],
42+ sv_base_mini_docker = sv_base_mini_docker ,
43+ runtime_attr_override = runtime_attr_ids_from_vcf
4444 }
45- Array [Pair [String , File ]] subsets_to_eval = select_first ([sample_subsets , [("ALL" , select_all ([GetSamplesList .out_file ]))]])
45+ Array [Pair [String , File ]] default_subsets = [("ALL" , GetSamplesList .out_file )]
46+ Array [Pair [String , File ]] subsets_to_eval = select_first ([sample_subsets , default_subsets ])
4647
4748 # Collect SV counts for each VCF in parallel unless sv_counts is provided
4849 if (!defined (sv_counts )) {
@@ -71,16 +72,41 @@ workflow IdentifyOutlierSamples {
7172 }
7273 }
7374
74- # Combine counts across all VCFs
75- call legacy . CombineCounts as Combine {
75+ # Combine counts across all VCFs (scattered over sample chunks)
76+ call cohort_utils . SplitUncompressed as ShardSamples {
7677 input :
77- svcounts = CountPerVcf .sv_counts ,
78- prefix = prefix ,
79- sv_pipeline_docker = sv_pipeline_docker ,
80- runtime_attr_override = runtime_attr_combine_counts
78+ whole_file = GetSamplesList .out_file ,
79+ lines_per_shard = samples_per_shard ,
80+ shard_prefix = prefix ,
81+ shuffle_file = true ,
82+ random_seed = 2023 ,
83+ sv_pipeline_docker = sv_pipeline_docker
84+ }
85+ scatter ( sample_shard in ShardSamples .shards ) {
86+ call SubsetCounts as SubsetPreCombine {
87+ input :
88+ svcounts = CountPerVcf .sv_counts ,
89+ samples_list = sample_shard ,
90+ outfile = "${prefix }.shard.counts.tsv" ,
91+ linux_docker = linux_docker ,
92+ runtime_attr_override = runtime_attr_subset_counts
93+ }
94+ call legacy .CombineCounts as CombineShard {
95+ input :
96+ svcounts = [SubsetPreCombine .counts_subset ],
97+ prefix = prefix ,
98+ sv_pipeline_docker = sv_pipeline_docker ,
99+ runtime_attr_override = runtime_attr_combine_counts
100+ }
101+ }
102+ call CatCounts as Combine {
103+ input :
104+ svcounts = CombineShard .summed_svcounts ,
105+ outfile = "${prefix }.merged.counts.tsv" ,
106+ linux_docker = linux_docker
81107 }
82108 }
83- File final_counts = select_first ([sv_counts , Combine .summed_svcounts ])
109+ File final_counts = select_first ([sv_counts , Combine .merged_counts ])
84110
85111 # If a precomputed outlier table is provided, directly apply those cutoffs
86112 if (defined (outlier_cutoff_table )) {
@@ -101,7 +127,7 @@ workflow IdentifyOutlierSamples {
101127 scatter ( subset_info in subsets_to_eval ) {
102128 call SubsetCounts {
103129 input :
104- svcounts = final_counts ,
130+ svcounts = [ final_counts ] ,
105131 samples_list = subset_info .right ,
106132 outfile = "${prefix }.${subset_info .left }.counts.tsv" ,
107133 linux_docker = linux_docker ,
@@ -249,7 +275,7 @@ task IdentifyOutliersByCutoffTable {
249275# Restrict a file of SV counts per sample to a subset of samples
250276task SubsetCounts {
251277 input {
252- File svcounts
278+ Array [ File ] svcounts
253279 File samples_list
254280 String outfile
255281 String linux_docker
@@ -273,8 +299,10 @@ task SubsetCounts {
273299 command <<<
274300
275301 set -euo pipefail
276- head -n1 ~{svcounts } > ~{outfile }
277- fgrep -wf ~{samples_list } ~{svcounts } >> ~{outfile }
302+ head -n1 ~{svcounts [0 ]} > ~{outfile }
303+ for file in ~{sep =" " svcounts }; do
304+ fgrep -wf ~{samples_list } "$file " >> ~{outfile }
305+ done
278306
279307 >>>
280308
@@ -290,6 +318,50 @@ task SubsetCounts {
290318
291319}
292320
# Naive concatenation of multiple per-sample SV counts files, keeping a single
# header line. The header is taken from the first file; every subsequent file
# contributes its data rows only.
task CatCounts {
  input {
    Array[File] svcounts   # counts TSVs to concatenate; all assumed to share the same header -- TODO confirm upstream guarantees this
    String outfile         # name of the merged output TSV
    String linux_docker
    RuntimeAttr? runtime_attr_override
  }

  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 3.75,
    disk_gb: 10,
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
  }
  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

  output {
    File merged_counts = "${outfile}"
  }

  command <<<

    set -euo pipefail
    # Seed the output with the header from the first file only
    head -n1 ~{svcounts[0]} > ~{outfile}
    # Append data rows (everything after the header) from every file,
    # including the first
    for file in ~{sep=" " svcounts}; do
      tail -n +2 "$file" >> ~{outfile}
    done

  >>>

  runtime {
    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
    docker: linux_docker
    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
  }

}
293365
294366# Merge outlier sample lists across algorithms
295367task CatOutliers {
0 commit comments