Commit cdca408

Final update before PR

1 parent: c9b332f

File tree: 2 files changed (+93, -19)

  wdl/FilterOutlierSamples.wdl
  wdl/IdentifyOutlierSamples.wdl

wdl/FilterOutlierSamples.wdl (2 additions, 0 deletions)

```diff
@@ -18,6 +18,7 @@ workflow FilterOutlierSamples {
     Boolean plot_counts = false
     Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset
     Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset
+    Int samples_per_shard = 5000
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
@@ -49,6 +50,7 @@ workflow FilterOutlierSamples {
       bcftools_preprocessing_options = bcftools_preprocessing_options,
       plot_counts = plot_counts,
       sample_subsets = sample_subsets,
+      samples_per_shard = samples_per_shard,
       sv_pipeline_docker = sv_pipeline_docker,
       sv_base_mini_docker = sv_base_mini_docker,
       linux_docker = linux_docker,
```
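The new `samples_per_shard` knob is simply plumbed through from the top-level workflow, so callers can override the default of 5000 from their inputs JSON. A minimal sketch of such an override (hypothetical file name; all other required workflow inputs elided):

```bash
# Hypothetical inputs override raising the shard size; merge this with the
# workflow's other required inputs before launching a run.
cat > samples_per_shard.override.json <<'EOF'
{
  "FilterOutlierSamples.samples_per_shard": 10000
}
EOF
```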

wdl/IdentifyOutlierSamples.wdl (91 additions, 19 deletions)

```diff
@@ -3,6 +3,7 @@ version 1.0
 import "Structs.wdl"
 import "PlotSVCountsPerSample.wdl" as plot_svcounts
 import "Utils.wdl" as util
+import "TasksMakeCohortVcf.wdl" as cohort_utils
 import "CollectQcVcfWide.wdl" as qc_utils
 import "FilterOutlierSamplesPostMinGQ.wdl" as legacy
 
```
```diff
@@ -19,6 +20,7 @@ workflow IdentifyOutlierSamples {
     String? bcftools_preprocessing_options
     Boolean plot_counts = false
     Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is an array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
+    Int samples_per_shard = 5000
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
```
```diff
@@ -34,15 +36,14 @@ workflow IdentifyOutlierSamples {
 
   String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name
 
-  if (!defined(sample_subsets)) {
-    call util.GetSampleIdsFromVcf as GetSamplesList {
-      input:
-        vcf = vcfs[0],
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_ids_from_vcf
-    }
+  call util.GetSampleIdsFromVcf as GetSamplesList {
+    input:
+      vcf = vcfs[0],
+      sv_base_mini_docker = sv_base_mini_docker,
+      runtime_attr_override = runtime_attr_ids_from_vcf
   }
-  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]])
+  Array[Pair[String, File]] default_subsets = [("ALL", GetSamplesList.out_file)]
+  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, default_subsets])
 
   # Collect SV counts for each VCF in parallel unless sv_counts is provided
   if (!defined(sv_counts)) {
```
```diff
@@ -71,16 +72,41 @@ workflow IdentifyOutlierSamples {
       }
     }
 
-    # Combine counts across all VCFs
-    call legacy.CombineCounts as Combine {
+    # Combine counts across all VCFs (scattered over sample chunks)
+    call cohort_utils.SplitUncompressed as ShardSamples {
       input:
-        svcounts = CountPerVcf.sv_counts,
-        prefix = prefix,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_combine_counts
+        whole_file = GetSamplesList.out_file,
+        lines_per_shard = samples_per_shard,
+        shard_prefix = prefix,
+        shuffle_file = true,
+        random_seed = 2023,
+        sv_pipeline_docker = sv_pipeline_docker
+    }
+    scatter ( sample_shard in ShardSamples.shards ) {
+      call SubsetCounts as SubsetPreCombine {
+        input:
+          svcounts = CountPerVcf.sv_counts,
+          samples_list = sample_shard,
+          outfile = "${prefix}.shard.counts.tsv",
+          linux_docker = linux_docker,
+          runtime_attr_override = runtime_attr_subset_counts
+      }
+      call legacy.CombineCounts as CombineShard {
+        input:
+          svcounts = [SubsetPreCombine.counts_subset],
+          prefix = prefix,
+          sv_pipeline_docker = sv_pipeline_docker,
+          runtime_attr_override = runtime_attr_combine_counts
+      }
+    }
+    call CatCounts as Combine {
+      input:
+        svcounts = CombineShard.summed_svcounts,
+        outfile = "${prefix}.merged.counts.tsv",
+        linux_docker = linux_docker
     }
   }
-  File final_counts = select_first([sv_counts, Combine.summed_svcounts])
+  File final_counts = select_first([sv_counts, Combine.merged_counts])
 
   # If a precomputed outlier table is provided, directly apply those cutoffs
   if (defined(outlier_cutoff_table)) {
```
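For orientation, the restructured combine stage is roughly equivalent to the following local pipeline. This is a minimal sketch: file names are hypothetical stand-ins for Cromwell-localized inputs, and both the shuffle (`shuffle_file = true`) and the per-sample summation performed by `legacy.CombineCounts` are elided.

```bash
# Minimal local sketch of the shard -> subset -> combine -> concatenate flow.
set -euo pipefail

# 1. SplitUncompressed: chunk the cohort sample list into shards of
#    samples_per_shard (default 5000) IDs each.
split -l 5000 all_samples.list shard_

# 2. Scatter over shards: restrict every per-VCF counts file to the shard's
#    samples, keeping one header (SubsetCounts as SubsetPreCombine).
for shard in shard_??; do
  head -n1 counts_vcf1.tsv > "${shard}.counts.tsv"
  for counts in counts_vcf1.tsv counts_vcf2.tsv; do
    fgrep -wf "$shard" "$counts" >> "${shard}.counts.tsv"
  done
done

# 3. CatCounts: concatenate per-shard results, dropping repeated headers.
head -n1 shard_aa.counts.tsv > merged.counts.tsv
for f in shard_??.counts.tsv; do
  sed '1d' "$f" >> merged.counts.tsv
done
```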
```diff
@@ -101,7 +127,7 @@ workflow IdentifyOutlierSamples {
   scatter ( subset_info in subsets_to_eval ) {
     call SubsetCounts {
       input:
-        svcounts = final_counts,
+        svcounts = [final_counts],
         samples_list = subset_info.right,
         outfile = "${prefix}.${subset_info.left}.counts.tsv",
         linux_docker = linux_docker,
```
```diff
@@ -249,7 +275,7 @@ task IdentifyOutliersByCutoffTable {
 # Restrict a file of SV counts per sample to a subset of samples
 task SubsetCounts {
   input {
-    File svcounts
+    Array[File] svcounts
     File samples_list
     String outfile
     String linux_docker
```
```diff
@@ -273,8 +299,10 @@ task SubsetCounts {
   command <<<
 
     set -euo pipefail
-    head -n1 ~{svcounts} > ~{outfile}
-    fgrep -wf ~{samples_list} ~{svcounts} >> ~{outfile}
+    head -n1 ~{svcounts[0]} > ~{outfile}
+    for file in ~{sep=" " svcounts}; do
+      fgrep -wf ~{samples_list} "$file" >> ~{outfile}
+    done
 
 
   >>>
```
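As a sanity check on the new multi-file handling, the same logic run on toy data outside WDL behaves as follows (hypothetical files; `~{sep=" " svcounts}` expands to a space-separated file list):

```bash
# Toy reproduction of the updated SubsetCounts command (hypothetical data).
set -euo pipefail
printf 'sample\tsvtype\tcount\nsampleA\tDEL\t10\nsampleB\tDEL\t12\n' > c1.tsv
printf 'sample\tsvtype\tcount\nsampleA\tDUP\t4\nsampleB\tDUP\t7\n' > c2.tsv
printf 'sampleA\n' > keep.list

head -n1 c1.tsv > subset.tsv                 # header from the first file only
for file in c1.tsv c2.tsv; do                # stands in for ~{sep=" " svcounts}
  fgrep -wf keep.list "$file" >> subset.tsv  # whole-word match on sample IDs
done
cat subset.tsv
# sample   svtype  count
# sampleA  DEL     10
# sampleA  DUP     4
```

One caveat worth noting: `fgrep` exits nonzero when a file yields no matches, which `set -euo pipefail` turns into a task failure; since each shard's sample list is drawn from the cohort VCF, every counts file should normally produce matches.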

```diff
@@ -290,6 +318,50 @@ task SubsetCounts {
 
 }
 
+# Naive concatenation of multiple counts files while accounting for header
+task CatCounts {
+  input {
+    Array[File] svcounts
+    String outfile
+    String linux_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1,
+    mem_gb: 3.75,
+    disk_gb: 10,
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  output {
+    File merged_counts = "${outfile}"
+  }
+
+  command <<<
+
+    set -euo pipefail
+    head -n1 ~{svcounts[0]} > ~{outfile}
+    for file in ~{sep=" " svcounts}; do
+      sed '1d' "$file" >> ~{outfile}
+    done
+
+  >>>
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: linux_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+
+}
 
 # Merge outlier sample lists across algorithms
 task CatOutliers {
```
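The header handling in CatCounts mirrors SubsetCounts: one header taken from the first input, body lines appended from all of them. On toy data (hypothetical, outside WDL):

```bash
# Toy reproduction of the CatCounts command (hypothetical data).
set -euo pipefail
printf 'sample\tcount\nsampleA\t10\n' > shard1.counts.tsv
printf 'sample\tcount\nsampleB\t12\n' > shard2.counts.tsv

head -n1 shard1.counts.tsv > merged.counts.tsv   # keep a single header
for file in shard1.counts.tsv shard2.counts.tsv; do
  sed '1d' "$file" >> merged.counts.tsv          # append body lines only
done
cat merged.counts.tsv
# sample   count
# sampleA  10
# sampleB  12
```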
