From aff60eef2febea8eb7f48538ca1b5548c9445135 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Mon, 9 Jan 2023 16:29:31 -0500
Subject: [PATCH 1/7] Add BCFTools preprocessing option to
 PlotSVCountsPerSample

---
 wdl/PlotSVCountsPerSample.wdl | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/wdl/PlotSVCountsPerSample.wdl b/wdl/PlotSVCountsPerSample.wdl
index 65602920f..0c9edab16 100644
--- a/wdl/PlotSVCountsPerSample.wdl
+++ b/wdl/PlotSVCountsPerSample.wdl
@@ -1,6 +1,7 @@
 version 1.0
 
 import "Structs.wdl"
+import "CollectQcVcfWide.wdl" as qc_utils
 
 # Workflow to count SVs per sample per type and 
 workflow PlotSVCountsPerSample {
@@ -8,7 +9,9 @@ workflow PlotSVCountsPerSample {
     String prefix
     Array[File?] vcfs
     Int N_IQR_cutoff
+    String? bcftools_preprocessing_options
     String sv_pipeline_docker
+    RuntimeAttr? runtime_override_preprocess_vcf
     RuntimeAttr? runtime_attr_count_svs
     RuntimeAttr? runtime_attr_plot_svcounts
     RuntimeAttr? runtime_attr_cat_outliers_preview
@@ -16,10 +19,23 @@ workflow PlotSVCountsPerSample {
 
   scatter (vcf in vcfs) {
     if (defined(vcf)) {
-      String vcf_name = basename(select_first([vcf]), ".vcf.gz")
+      # Preprocess VCF with bcftools, if optioned
+      if (defined(bcftools_preprocessing_options)) {
+        call qc_utils.PreprocessVcf {
+          input:
+            vcf=select_first([vcf]),
+            prefix=basename(select_first([vcf]), ".vcf.gz") + ".preprocessed",
+            bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]),
+            sv_base_mini_docker=sv_pipeline_docker,
+            runtime_attr_override=runtime_override_preprocess_vcf
+        }
+      }
+      File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf])
+    
+      String vcf_name = basename(select_first([prepped_vcf]), ".vcf.gz")
       call CountSVsPerSamplePerType {
         input:
-          vcf = select_first([vcf]),
+          vcf = select_first([prepped_vcf]),
           prefix = vcf_name,
           sv_pipeline_docker = sv_pipeline_docker,
           runtime_attr_override = runtime_attr_count_svs

From da042c6f4410acc5e4a95768d6d762f8c205b829 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Mon, 9 Jan 2023 16:55:47 -0500
Subject: [PATCH 2/7] Add BCFTools preprocessing option to outlier sample
 exclusion

---
 wdl/FilterOutlierSamples.wdl   |  4 ++++
 wdl/IdentifyOutlierSamples.wdl | 22 +++++++++++++++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index df423292e..2b426b196 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -13,9 +13,11 @@ workflow FilterOutlierSamples {
     Int N_IQR_cutoff
     File? outlier_cutoff_table
     String? vcf_identifier  # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes
+    String? bcftools_preprocessing_options
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
+    RuntimeAttr? runtime_override_preprocess_vcf
     RuntimeAttr? runtime_attr_identify_outliers
     RuntimeAttr? runtime_attr_subset_vcf
     RuntimeAttr? runtime_attr_cat_outliers
@@ -32,8 +34,10 @@ workflow FilterOutlierSamples {
       N_IQR_cutoff = N_IQR_cutoff,
       outlier_cutoff_table = outlier_cutoff_table,
       vcf_identifier = vcf_identifier,
+      bcftools_preprocessing_options = bcftools_preprocessing_options,
       sv_pipeline_docker = sv_pipeline_docker,
       linux_docker = linux_docker,
+      runtime_override_preprocess_vcf = runtime_override_preprocess_vcf,
       runtime_attr_identify_outliers = runtime_attr_identify_outliers,
       runtime_attr_cat_outliers = runtime_attr_cat_outliers,
       runtime_attr_count_svs = runtime_attr_count_svs
diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl
index 462c79746..afbd6d24b 100644
--- a/wdl/IdentifyOutlierSamples.wdl
+++ b/wdl/IdentifyOutlierSamples.wdl
@@ -2,6 +2,7 @@ version 1.0
 
 import "Structs.wdl"
 import "PlotSVCountsPerSample.wdl" as plot_svcounts
+import "CollectQcVcfWide.wdl" as qc_utils
 
 workflow IdentifyOutlierSamples {
   input {
@@ -11,8 +12,10 @@ workflow IdentifyOutlierSamples {
     Int N_IQR_cutoff
     File? outlier_cutoff_table
     String? vcf_identifier  # required (enter algorithm here) if providing outlier_cutoff_table, optional otherwise to add to file prefixes (ie. as a VCF identifier)
+    String? bcftools_preprocessing_options
     String sv_pipeline_docker
     String linux_docker
+    RuntimeAttr? runtime_override_preprocess_vcf
     RuntimeAttr? runtime_attr_identify_outliers
     RuntimeAttr? runtime_attr_cat_outliers
     RuntimeAttr? runtime_attr_count_svs
@@ -20,10 +23,23 @@ workflow IdentifyOutlierSamples {
 
   String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name
 
+  # Preprocess VCF with bcftools, if optioned
+  if (defined(bcftools_preprocessing_options)) {
+    call qc_utils.PreprocessVcf {
+      input:
+        vcf=vcf,
+        prefix=basename(vcf, ".vcf.gz") + ".preprocessed",
+        bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]),
+        sv_base_mini_docker=sv_pipeline_docker,
+        runtime_attr_override=runtime_override_preprocess_vcf
+    }
+  }
+  File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf])
+
   if (!defined(sv_counts)) {
     call plot_svcounts.CountSVsPerSamplePerType {
       input:
-        vcf = vcf,
+        vcf = prepped_vcf,
         prefix = prefix,
         sv_pipeline_docker = sv_pipeline_docker,
         runtime_attr_override = runtime_attr_count_svs
@@ -33,7 +49,7 @@ workflow IdentifyOutlierSamples {
   if (defined(outlier_cutoff_table)) {
     call IdentifyOutliersByCutoffTable {
       input:
-        vcf = vcf,
+        vcf = prepped_vcf,
         svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]),
         outlier_cutoff_table = select_first([outlier_cutoff_table]),
         outfile = "${prefix}_outliers.txt",
@@ -44,7 +60,7 @@ workflow IdentifyOutlierSamples {
   }
   call IdentifyOutliersByIQR {
     input:
-      vcf = vcf,
+      vcf = prepped_vcf,
       svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]),
       N_IQR_cutoff = N_IQR_cutoff,
       outfile = "${prefix}_IQR_outliers.txt",

From 9520b8cde07937d73b99b27fa8479fd4cb13003e Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Mon, 9 Jan 2023 17:23:01 -0500
Subject: [PATCH 3/7] Output subsetted VCF index from outlier sample exclusion

---
 wdl/FilterOutlierSamples.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index 2b426b196..1ee0fb8a9 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -72,6 +72,7 @@ workflow FilterOutlierSamples {
 
   output {
     File outlier_filtered_vcf = SubsetVcfBySamplesList.vcf_subset
+    File outlier_filtered_vcf_idx = SubsetVcfBySamplesList.vcf_subset_index
     Array[String] filtered_samples_list = FilterSampleList.filtered_samples_list
     File filtered_samples_file = FilterSampleList.filtered_samples_file
     Array[String] outlier_samples_excluded = IdentifyOutlierSamples.outlier_samples_list

From 0f87632b388d51a865af5cc3af10e6515bd528d8 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Tue, 7 Feb 2023 17:01:11 -0500
Subject: [PATCH 4/7] Extend outlier exclusion workflow to accept multiple VCFs

---
 wdl/FilterOutlierSamples.wdl   | 31 +++++++++--------
 wdl/IdentifyOutlierSamples.wdl | 61 +++++++++++++++++++++-------------
 wdl/PlotSVCountsPerSample.wdl  |  2 +-
 3 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index 1ee0fb8a9..80d38b6cf 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -4,11 +4,12 @@ import "Structs.wdl"
 import "Utils.wdl" as util
 import "IdentifyOutlierSamples.wdl" as identify_outliers
 
-# Filter outlier samples by IQR or cutoff table for a single VCF. Recommended to run PlotSVCountsPerSample first to choose cutoff
+# Filter outlier samples by IQR or cutoff table for one or more VCFs
+# Recommended to run PlotSVCountsPerSample first to choose cutoff
 workflow FilterOutlierSamples {
   input {
     String name  # batch or cohort
-    File vcf
+    Array[File] vcfs
     File? sv_counts  # SV counts file from PlotSVCountsPerSample - if not provided, will create
     Int N_IQR_cutoff
     File? outlier_cutoff_table
@@ -28,7 +29,7 @@ workflow FilterOutlierSamples {
 
   call identify_outliers.IdentifyOutlierSamples {
     input:
-      vcf = vcf,
+      vcfs = vcfs,
       name = name,
       sv_counts = sv_counts, 
       N_IQR_cutoff = N_IQR_cutoff,
@@ -43,19 +44,21 @@ workflow FilterOutlierSamples {
       runtime_attr_count_svs = runtime_attr_count_svs
   }
 
-  call util.SubsetVcfBySamplesList {
-    input:
-      vcf = vcf,
-      list_of_samples = IdentifyOutlierSamples.outlier_samples_file,
-      outfile_name = "${name}.outliers_removed.vcf.gz",
-      remove_samples = true,
-      sv_base_mini_docker = sv_base_mini_docker,
-      runtime_attr_override = runtime_attr_subset_vcf
+  scatter ( vcf in vcfs ) {
+    call util.SubsetVcfBySamplesList {
+      input:
+        vcf = vcf,
+        list_of_samples = IdentifyOutlierSamples.outlier_samples_file,
+        outfile_name = basename(vcf, ".vcf.gz") + ".${name}.outliers_removed.vcf.gz",
+        remove_samples = true,
+        sv_base_mini_docker = sv_base_mini_docker,
+        runtime_attr_override = runtime_attr_subset_vcf
+    }
   }
   
   call util.GetSampleIdsFromVcf {
     input:
-      vcf = vcf,
+      vcf = vcfs[0],
       sv_base_mini_docker = sv_base_mini_docker,
       runtime_attr_override = runtime_attr_ids_from_vcf
   }
@@ -71,8 +74,8 @@ workflow FilterOutlierSamples {
   }
 
   output {
-    File outlier_filtered_vcf = SubsetVcfBySamplesList.vcf_subset
-    File outlier_filtered_vcf_idx = SubsetVcfBySamplesList.vcf_subset_index
+    Array[File] outlier_filtered_vcfs = SubsetVcfBySamplesList.vcf_subset
+    Array[File] outlier_filtered_vcf_idxs = SubsetVcfBySamplesList.vcf_subset_index
     Array[String] filtered_samples_list = FilterSampleList.filtered_samples_list
     File filtered_samples_file = FilterSampleList.filtered_samples_file
     Array[String] outlier_samples_excluded = IdentifyOutlierSamples.outlier_samples_list
diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl
index afbd6d24b..5a002cff3 100644
--- a/wdl/IdentifyOutlierSamples.wdl
+++ b/wdl/IdentifyOutlierSamples.wdl
@@ -3,11 +3,14 @@ version 1.0
 import "Structs.wdl"
 import "PlotSVCountsPerSample.wdl" as plot_svcounts
 import "CollectQcVcfWide.wdl" as qc_utils
+import "FilterOutlierSamplesPostMinGQ.wdl" as legacy
+
+# Identifies a list of SV count outlier samples from one or more input VCFs
 
 workflow IdentifyOutlierSamples {
   input {
     String name  # batch or cohort
-    File vcf
+    Array[File] vcfs
     File? sv_counts  # SV counts file from PlotSVCountsPerSample - if not provided, will create
     Int N_IQR_cutoff
     File? outlier_cutoff_table
@@ -23,34 +26,47 @@ workflow IdentifyOutlierSamples {
 
   String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name
 
-  # Preprocess VCF with bcftools, if optioned
-  if (defined(bcftools_preprocessing_options)) {
-    call qc_utils.PreprocessVcf {
-      input:
-        vcf=vcf,
-        prefix=basename(vcf, ".vcf.gz") + ".preprocessed",
-        bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]),
-        sv_base_mini_docker=sv_pipeline_docker,
-        runtime_attr_override=runtime_override_preprocess_vcf
+  # Collect SV counts for each VCF in parallel inless sv_counts is provided
+  if (!defined(sv_counts)) {
+
+    # Process each VCF in parallel
+    scatter ( vcf in vcfs ) {
+      # Preprocess VCF with bcftools, if optioned
+      if (defined(bcftools_preprocessing_options)) {
+        call qc_utils.PreprocessVcf {
+          input:
+            vcf = vcf,
+            prefix = basename(vcf, ".vcf.gz") + ".preprocessed",
+            bcftools_preprocessing_options = select_first([bcftools_preprocessing_options]),
+            sv_base_mini_docker = sv_pipeline_docker,
+            runtime_attr_override = runtime_override_preprocess_vcf
+        }
+      }
+      File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf])
+
+      call plot_svcounts.CountSVsPerSamplePerType {
+        input:
+          vcf = prepped_vcf,
+          prefix = prefix,
+          sv_pipeline_docker = sv_pipeline_docker,
+          runtime_attr_override = runtime_attr_count_svs
+      }
     }
-  }
-  File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf])
 
-  if (!defined(sv_counts)) {
-    call plot_svcounts.CountSVsPerSamplePerType {
+    # Combine counts across all VCFs
+    call legacy.CombineCounts as Combine {
       input:
-        vcf = prepped_vcf,
+        svcounts = CountSVsPerSamplePerType.sv_counts,
         prefix = prefix,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_count_svs
+        sv_pipeline_docker = sv_pipeline_docker
     }
   }
+  File final_counts = select_first([sv_counts, Combine.summed_svcounts])
 
   if (defined(outlier_cutoff_table)) {
     call IdentifyOutliersByCutoffTable {
       input:
-        vcf = prepped_vcf,
-        svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]),
+        svcounts = final_counts,
         outlier_cutoff_table = select_first([outlier_cutoff_table]),
         outfile = "${prefix}_outliers.txt",
         algorithm = select_first([vcf_identifier]),
@@ -60,8 +76,7 @@ workflow IdentifyOutlierSamples {
   }
   call IdentifyOutliersByIQR {
     input:
-      vcf = prepped_vcf,
-      svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]),
+      svcounts = final_counts,
       N_IQR_cutoff = N_IQR_cutoff,
       outfile = "${prefix}_IQR_outliers.txt",
       sv_pipeline_docker = sv_pipeline_docker,
@@ -80,13 +95,12 @@ workflow IdentifyOutlierSamples {
   output {
     File outlier_samples_file = CatOutliers.outliers_file
     Array[String] outlier_samples_list = CatOutliers.outliers_list
-    File sv_counts_file = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts])
+    File sv_counts_file = final_counts
   }
 }
 
 task IdentifyOutliersByIQR {
   input {
-    File vcf
     File svcounts
     Int N_IQR_cutoff
     String outfile
@@ -132,7 +146,6 @@ task IdentifyOutliersByIQR {
 
 task IdentifyOutliersByCutoffTable {
   input {
-    File vcf
     File svcounts
     File outlier_cutoff_table
     String outfile
diff --git a/wdl/PlotSVCountsPerSample.wdl b/wdl/PlotSVCountsPerSample.wdl
index 0c9edab16..180ae0a7c 100644
--- a/wdl/PlotSVCountsPerSample.wdl
+++ b/wdl/PlotSVCountsPerSample.wdl
@@ -3,7 +3,7 @@ version 1.0
 import "Structs.wdl"
 import "CollectQcVcfWide.wdl" as qc_utils
 
-# Workflow to count SVs per sample per type and 
+# Workflow to count SVs per sample per type
 workflow PlotSVCountsPerSample {
   input {
     String prefix

From 6442927c2c03f750bcec4b8aa39b29fcd99618c0 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Wed, 8 Feb 2023 11:20:05 -0500
Subject: [PATCH 5/7] Extend outlier exclusion workflow, v1

---
 .../sum_svcounts_perSample.R                  |  19 +--
 wdl/FilterOutlierSamples.wdl                  |  12 +-
 wdl/IdentifyOutlierSamples.wdl                | 119 ++++++++++++++++--
 3 files changed, 130 insertions(+), 20 deletions(-)

diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
index dfe02f91a..e49b8f129 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
@@ -4,26 +4,29 @@
 # in final_outlier_sample_filter.wdl
 
 
-###Set master parameters & read arguments
-options(stringsAsFactors=F,scipen=1000)
+###Set parameters & read arguments
+options(stringsAsFactors=F, scipen=1000)
 args <- commandArgs(trailingOnly=TRUE)
 INFILE <- args[1]
 OUTFILE <- args[2]
 
 ###Read input data & reformat
-dat <- read.table(INFILE,header=F)
-colnames(dat) <- c("sample","svtype","count","chrom")
+dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="")
+if(dat[1, 1] %in% c("sample", "#sample")){
+  dat <- dat[-c(1), ]
+}
+colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)]
 samples <- as.character(unique(dat$sample))
 svtypes <- as.character(unique(dat$svtype))
 
 ###Get sum of counts per sample per svtype
-summed.res <- do.call("rbind", lapply(samples,function(sample){
-  return(do.call("rbind", lapply(svtypes,function(svtype){
+summed.res <- do.call("rbind", lapply(samples, function(sample){
+  return(do.call("rbind", lapply(svtypes, function(svtype){
     return(data.frame("sample"=sample,
                "svtype"=svtype,
-               "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype),]$count,na.rm=T)))
+               "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T)))
   })))
 }))
 
 ###Write summed results to outfile
-write.table(summed.res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F)
+write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F)
diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index 80d38b6cf..3e0fcfa68 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -15,6 +15,8 @@ workflow FilterOutlierSamples {
     File? outlier_cutoff_table
     String? vcf_identifier  # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes
     String? bcftools_preprocessing_options
+    Boolean plot_counts = false
+    Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
@@ -22,9 +24,11 @@ workflow FilterOutlierSamples {
     RuntimeAttr? runtime_attr_identify_outliers
     RuntimeAttr? runtime_attr_subset_vcf
     RuntimeAttr? runtime_attr_cat_outliers
+    RuntimeAttr? runtime_attr_subset_counts
     RuntimeAttr? runtime_attr_filter_samples
     RuntimeAttr? runtime_attr_ids_from_vcf
     RuntimeAttr? runtime_attr_count_svs
+    RuntimeAttr? runtime_attr_plot_svcounts
   }
 
   call identify_outliers.IdentifyOutlierSamples {
@@ -36,12 +40,18 @@ workflow FilterOutlierSamples {
       outlier_cutoff_table = outlier_cutoff_table,
       vcf_identifier = vcf_identifier,
       bcftools_preprocessing_options = bcftools_preprocessing_options,
+      plot_counts = plot_counts,
+      sample_subsets = sample_subsets,
       sv_pipeline_docker = sv_pipeline_docker,
+      sv_base_mini_docker = sv_base_mini_docker,
       linux_docker = linux_docker,
+      runtime_attr_ids_from_vcf = runtime_attr_ids_from_vcf,
       runtime_override_preprocess_vcf = runtime_override_preprocess_vcf,
       runtime_attr_identify_outliers = runtime_attr_identify_outliers,
       runtime_attr_cat_outliers = runtime_attr_cat_outliers,
-      runtime_attr_count_svs = runtime_attr_count_svs
+      runtime_attr_subset_counts = runtime_attr_subset_counts,
+      runtime_attr_count_svs = runtime_attr_count_svs,
+      runtime_attr_plot_svcounts = runtime_attr_plot_svcounts
   }
 
   scatter ( vcf in vcfs ) {
diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl
index 5a002cff3..87beb8bdd 100644
--- a/wdl/IdentifyOutlierSamples.wdl
+++ b/wdl/IdentifyOutlierSamples.wdl
@@ -2,6 +2,7 @@ version 1.0
 
 import "Structs.wdl"
 import "PlotSVCountsPerSample.wdl" as plot_svcounts
+import "Utils.wdl" as util
 import "CollectQcVcfWide.wdl" as qc_utils
 import "FilterOutlierSamplesPostMinGQ.wdl" as legacy
 
@@ -16,17 +17,33 @@ workflow IdentifyOutlierSamples {
     File? outlier_cutoff_table
     String? vcf_identifier  # required (enter algorithm here) if providing outlier_cutoff_table, optional otherwise to add to file prefixes (ie. as a VCF identifier)
     String? bcftools_preprocessing_options
+    Boolean plot_counts = false
+    Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
     String sv_pipeline_docker
+    String sv_base_mini_docker
     String linux_docker
+    RuntimeAttr? runtime_attr_ids_from_vcf
     RuntimeAttr? runtime_override_preprocess_vcf
     RuntimeAttr? runtime_attr_identify_outliers
     RuntimeAttr? runtime_attr_cat_outliers
+    RuntimeAttr? runtime_attr_subset_counts
     RuntimeAttr? runtime_attr_count_svs
+    RuntimeAttr? runtime_attr_plot_svcounts
   }
 
   String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name
 
-  # Collect SV counts for each VCF in parallel inless sv_counts is provided
+  if (!defined(sample_subsets)) {
+    call util.GetSampleIdsFromVcf as GetSamplesList {
+      input:
+        vcf = vcfs[0],
+        sv_base_mini_docker = sv_base_mini_docker,
+        runtime_attr_override = runtime_attr_ids_from_vcf
+    }
+  }
+  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]])
+
+  # Collect SV counts for each VCF in parallel unless sv_counts is provided
   if (!defined(sv_counts)) {
 
     # Process each VCF in parallel
@@ -44,7 +61,7 @@ workflow IdentifyOutlierSamples {
       }
       File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf])
 
-      call plot_svcounts.CountSVsPerSamplePerType {
+      call plot_svcounts.CountSVsPerSamplePerType as CountPerVcf {
         input:
           vcf = prepped_vcf,
           prefix = prefix,
@@ -56,13 +73,14 @@ workflow IdentifyOutlierSamples {
     # Combine counts across all VCFs
     call legacy.CombineCounts as Combine {
       input:
-        svcounts = CountSVsPerSamplePerType.sv_counts,
+        svcounts = CountPerVcf.sv_counts,
         prefix = prefix,
         sv_pipeline_docker = sv_pipeline_docker
     }
   }
   File final_counts = select_first([sv_counts, Combine.summed_svcounts])
 
+  # If a precomputed outlier table is provided, directly apply those cutoffs
   if (defined(outlier_cutoff_table)) {
     call IdentifyOutliersByCutoffTable {
       input:
@@ -74,19 +92,53 @@ workflow IdentifyOutlierSamples {
         runtime_attr_override = runtime_attr_identify_outliers
     }
   }
-  call IdentifyOutliersByIQR {
-    input:
-      svcounts = final_counts,
-      N_IQR_cutoff = N_IQR_cutoff,
-      outfile = "${prefix}_IQR_outliers.txt",
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override = runtime_attr_identify_outliers
+  # Otherwise, dynamically determine outliers based on N_IQR
+  if (!defined(outlier_cutoff_table)) {
+
+    # Determine outliers for each sample subset, if optioned (e.g., PCR+ vs. PCR-)
+    scatter ( subset_info in subsets_to_eval ) {
+      call SubsetCounts {
+        input:
+          svcounts = final_counts,
+          samples_list = subset_info.right,
+          outfile = "${prefix}.${subset_info.left}.counts.tsv",
+          linux_docker = linux_docker,
+          runtime_attr_override = runtime_attr_subset_counts
+      }
+      call IdentifyOutliersByIQR {
+        input:
+          svcounts = SubsetCounts.counts_subset,
+          N_IQR_cutoff = N_IQR_cutoff,
+          outfile = "${prefix}.${subset_info.left}.IQR_outliers.txt",
+          sv_pipeline_docker = sv_pipeline_docker,
+          runtime_attr_override = runtime_attr_identify_outliers
+      }
+      if (plot_counts) {
+        call plot_svcounts.PlotSVCountsWithCutoff as PlotCountsPerSubset {
+            input:
+              svcounts = SubsetCounts.counts_subset,
+              n_iqr_cutoff = N_IQR_cutoff,
+              prefix = "${prefix}.${subset_info.left}",
+              sv_pipeline_docker = sv_pipeline_docker,
+              runtime_attr_override = runtime_attr_plot_svcounts
+        }
+      }
+    }
+
+    # Merge outlier lists across all subsets
+    call CatOutliers as CatSubsets {
+      input:
+        outliers = select_all(IdentifyOutliersByIQR.outliers_list),
+        batch = prefix,
+        linux_docker = linux_docker,
+        runtime_attr_override = runtime_attr_cat_outliers
+    }
   }
 
   # Merge list of outliers
   call CatOutliers {
     input:
-      outliers = select_all([IdentifyOutliersByIQR.outliers_list,IdentifyOutliersByCutoffTable.outliers_list]),
+      outliers = select_all([CatSubsets.outliers_file, IdentifyOutliersByCutoffTable.outliers_list]),
       batch = prefix,
       linux_docker = linux_docker,
       runtime_attr_override = runtime_attr_cat_outliers
@@ -96,6 +148,7 @@ workflow IdentifyOutlierSamples {
     File outlier_samples_file = CatOutliers.outliers_file
     Array[String] outlier_samples_list = CatOutliers.outliers_list
     File sv_counts_file = final_counts
+    Array[File?]? outlier_plots = PlotCountsPerSubset.svcount_distrib_plots
   }
 }
 
@@ -191,6 +244,50 @@ task IdentifyOutliersByCutoffTable {
 
 }
 
+# Restrict a file of SV counts per sample to a subset of samples
+task SubsetCounts {
+  input {
+    File svcounts
+    File samples_list
+    String outfile
+    String linux_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1, 
+    mem_gb: 3.75, 
+    disk_gb: 10,
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  output {
+    File counts_subset = "${outfile}"
+  }
+
+  command <<<
+
+    set -euo pipefail
+    head -n1 ~{svcounts} > ~{outfile}
+    fgrep -wf ~{samples_list} ~{svcounts} >> ~{outfile}
+  
+  >>>
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: linux_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+
+}
+
 
 # Merge outlier sample lists across algorithms
 task CatOutliers {

From c9b332fa6a37f53f5a01782ec8a1e8a1e16369d0 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Thu, 9 Feb 2023 11:04:59 -0500
Subject: [PATCH 6/7] Fix header issues when combining SV counts from multiple
 VCFs

---
 inputs/values/dockers.json                    |  2 +-
 .../sum_svcounts_perSample.R                  | 23 +++++++++++--------
 wdl/FilterOutlierSamples.wdl                  | 10 +++++++-
 wdl/IdentifyOutlierSamples.wdl                |  6 +++--
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
index 8448074b6..8d20e0d3e 100644
--- a/inputs/values/dockers.json
+++ b/inputs/values/dockers.json
@@ -33,4 +33,4 @@
   "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d",
   "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f",
   "str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670"
-}
\ No newline at end of file
+}
diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
index e49b8f129..8989a6b2d 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R
@@ -4,29 +4,32 @@
 # in final_outlier_sample_filter.wdl
 
 
-###Set parameters & read arguments
+### Set parameters & read arguments
 options(stringsAsFactors=F, scipen=1000)
 args <- commandArgs(trailingOnly=TRUE)
 INFILE <- args[1]
 OUTFILE <- args[2]
 
-###Read input data & reformat
+### Read input data & reformat
 dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="")
-if(dat[1, 1] %in% c("sample", "#sample")){
-  dat <- dat[-c(1), ]
+drop.rows <- which(dat[, 1] %in% c("sample", "#sample"))
+if(length(drop.rows) > 0){
+  dat <- dat[-drop.rows, ]
 }
 colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)]
+dat$count <- as.numeric(dat$count)
 samples <- as.character(unique(dat$sample))
 svtypes <- as.character(unique(dat$svtype))
 
-###Get sum of counts per sample per svtype
+### Get sum of counts per sample per svtype
 summed.res <- do.call("rbind", lapply(samples, function(sample){
-  return(do.call("rbind", lapply(svtypes, function(svtype){
-    return(data.frame("sample"=sample,
+  do.call("rbind", lapply(svtypes, function(svtype){
+    data.frame("sample"=sample,
                "svtype"=svtype,
-               "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T)))
-  })))
+               "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,
+                           na.rm=T))
+  }))
 }))
 
-###Write summed results to outfile
+### Write summed results to outfile
 write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F)
diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index 3e0fcfa68..93be211e6 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -16,7 +16,8 @@ workflow FilterOutlierSamples {
     String? vcf_identifier  # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes
     String? bcftools_preprocessing_options
     Boolean plot_counts = false
-    Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
+    Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset
+    Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
@@ -28,9 +29,15 @@ workflow FilterOutlierSamples {
     RuntimeAttr? runtime_attr_filter_samples
     RuntimeAttr? runtime_attr_ids_from_vcf
     RuntimeAttr? runtime_attr_count_svs
+    RuntimeAttr? runtime_attr_combine_counts
     RuntimeAttr? runtime_attr_plot_svcounts
   }
 
+  if (defined(sample_subset_prefixes) && defined(sample_subset_lists)) {
+    Array[Pair[String, File]]? sample_subsets = zip(select_first([sample_subset_prefixes]), 
+                                                    select_first([sample_subset_lists]))
+  }
+
   call identify_outliers.IdentifyOutlierSamples {
     input:
       vcfs = vcfs,
@@ -51,6 +58,7 @@ workflow FilterOutlierSamples {
       runtime_attr_cat_outliers = runtime_attr_cat_outliers,
       runtime_attr_subset_counts = runtime_attr_subset_counts,
       runtime_attr_count_svs = runtime_attr_count_svs,
+      runtime_attr_combine_counts = runtime_attr_combine_counts,
       runtime_attr_plot_svcounts = runtime_attr_plot_svcounts
   }
 
diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl
index 87beb8bdd..83cbe10d5 100644
--- a/wdl/IdentifyOutlierSamples.wdl
+++ b/wdl/IdentifyOutlierSamples.wdl
@@ -28,6 +28,7 @@ workflow IdentifyOutlierSamples {
     RuntimeAttr? runtime_attr_cat_outliers
     RuntimeAttr? runtime_attr_subset_counts
     RuntimeAttr? runtime_attr_count_svs
+    RuntimeAttr? runtime_attr_combine_counts
     RuntimeAttr? runtime_attr_plot_svcounts
   }
 
@@ -41,7 +42,7 @@ workflow IdentifyOutlierSamples {
         runtime_attr_override = runtime_attr_ids_from_vcf
     }
   }
-  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]])
+  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]])
 
   # Collect SV counts for each VCF in parallel unless sv_counts is provided
   if (!defined(sv_counts)) {
@@ -75,7 +76,8 @@ workflow IdentifyOutlierSamples {
       input:
         svcounts = CountPerVcf.sv_counts,
         prefix = prefix,
-        sv_pipeline_docker = sv_pipeline_docker
+        sv_pipeline_docker = sv_pipeline_docker,
+        runtime_attr_override = runtime_attr_combine_counts
     }
   }
   File final_counts = select_first([sv_counts, Combine.summed_svcounts])

From cdca40801c8ef625983db442b5b788b11b05851d Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Sat, 11 Feb 2023 07:54:32 -0500
Subject: [PATCH 7/7] Final update before PR

---
 wdl/FilterOutlierSamples.wdl   |   2 +
 wdl/IdentifyOutlierSamples.wdl | 110 +++++++++++++++++++++++++++------
 2 files changed, 93 insertions(+), 19 deletions(-)

diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl
index 93be211e6..3c54fce5d 100644
--- a/wdl/FilterOutlierSamples.wdl
+++ b/wdl/FilterOutlierSamples.wdl
@@ -18,6 +18,7 @@ workflow FilterOutlierSamples {
     Boolean plot_counts = false
     Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset
     Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset
+    Int samples_per_shard = 5000
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
@@ -49,6 +50,7 @@ workflow FilterOutlierSamples {
       bcftools_preprocessing_options = bcftools_preprocessing_options,
       plot_counts = plot_counts,
       sample_subsets = sample_subsets,
+      samples_per_shard = samples_per_shard,
       sv_pipeline_docker = sv_pipeline_docker,
       sv_base_mini_docker = sv_base_mini_docker,
       linux_docker = linux_docker,
diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl
index 83cbe10d5..273670023 100644
--- a/wdl/IdentifyOutlierSamples.wdl
+++ b/wdl/IdentifyOutlierSamples.wdl
@@ -3,6 +3,7 @@ version 1.0
 import "Structs.wdl"
 import "PlotSVCountsPerSample.wdl" as plot_svcounts
 import "Utils.wdl" as util
+import "TasksMakeCohortVcf.wdl" as cohort_utils
 import "CollectQcVcfWide.wdl" as qc_utils
 import "FilterOutlierSamplesPostMinGQ.wdl" as legacy
 
@@ -19,6 +20,7 @@ workflow IdentifyOutlierSamples {
     String? bcftools_preprocessing_options
     Boolean plot_counts = false
     Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
+    Int samples_per_shard = 5000
     String sv_pipeline_docker
     String sv_base_mini_docker
     String linux_docker
@@ -34,15 +36,14 @@ workflow IdentifyOutlierSamples {
 
   String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name
 
-  if (!defined(sample_subsets)) {
-    call util.GetSampleIdsFromVcf as GetSamplesList {
-      input:
-        vcf = vcfs[0],
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_ids_from_vcf
-    }
+  call util.GetSampleIdsFromVcf as GetSamplesList {
+    input:
+      vcf = vcfs[0],
+      sv_base_mini_docker = sv_base_mini_docker,
+      runtime_attr_override = runtime_attr_ids_from_vcf
   }
-  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]])
+  Array[Pair[String, File]] default_subsets = [("ALL", GetSamplesList.out_file)]
+  Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, default_subsets])
 
   # Collect SV counts for each VCF in parallel unless sv_counts is provided
   if (!defined(sv_counts)) {
@@ -71,16 +72,41 @@ workflow IdentifyOutlierSamples {
       }
     }
 
-    # Combine counts across all VCFs
-    call legacy.CombineCounts as Combine {
+    # Combine counts across all VCFs (scattered over sample chunks)
+    call cohort_utils.SplitUncompressed as ShardSamples {
       input:
-        svcounts = CountPerVcf.sv_counts,
-        prefix = prefix,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_combine_counts
+        whole_file = GetSamplesList.out_file,
+        lines_per_shard = samples_per_shard,
+        shard_prefix = prefix,
+        shuffle_file = true,
+        random_seed = 2023,
+        sv_pipeline_docker = sv_pipeline_docker
+    }
+    scatter ( sample_shard in ShardSamples.shards ) {
+      call SubsetCounts as SubsetPreCombine {
+        input:
+          svcounts = CountPerVcf.sv_counts,
+          samples_list = sample_shard,
+          outfile = "${prefix}.shard.counts.tsv",
+          linux_docker = linux_docker,
+          runtime_attr_override = runtime_attr_subset_counts
+      }
+      call legacy.CombineCounts as CombineShard {
+        input:
+          svcounts = [SubsetPreCombine.counts_subset],
+          prefix = prefix,
+          sv_pipeline_docker = sv_pipeline_docker,
+          runtime_attr_override = runtime_attr_combine_counts
+      }
+    }
+    call CatCounts as Combine {
+      input:
+        svcounts = CombineShard.summed_svcounts,
+        outfile = "${prefix}.merged.counts.tsv",
+        linux_docker = linux_docker
     }
   }
-  File final_counts = select_first([sv_counts, Combine.summed_svcounts])
+  File final_counts = select_first([sv_counts, Combine.merged_counts])
 
   # If a precomputed outlier table is provided, directly apply those cutoffs
   if (defined(outlier_cutoff_table)) {
@@ -101,7 +127,7 @@ workflow IdentifyOutlierSamples {
     scatter ( subset_info in subsets_to_eval ) {
       call SubsetCounts {
         input:
-          svcounts = final_counts,
+          svcounts = [final_counts],
           samples_list = subset_info.right,
           outfile = "${prefix}.${subset_info.left}.counts.tsv",
           linux_docker = linux_docker,
@@ -249,7 +275,7 @@ task IdentifyOutliersByCutoffTable {
 # Restrict a file of SV counts per sample to a subset of samples
 task SubsetCounts {
   input {
-    File svcounts
+    Array[File] svcounts
     File samples_list
     String outfile
     String linux_docker
@@ -273,8 +299,10 @@ task SubsetCounts {
   command <<<
 
     set -euo pipefail
-    head -n1 ~{svcounts} > ~{outfile}
-    fgrep -wf ~{samples_list} ~{svcounts} >> ~{outfile}
+    head -n1 ~{svcounts[0]} > ~{outfile}
+    for file in ~{sep=" " svcounts}; do
+      fgrep -wf ~{samples_list} "$file" >> ~{outfile}
+    done
   
   >>>
 
@@ -290,6 +318,50 @@ task SubsetCounts {
 
 }
 
+# Naive concatenation of multiple counts files while accounting for header
+task CatCounts {
+  input {
+    Array[File] svcounts
+    String outfile
+    String linux_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1, 
+    mem_gb: 3.75, 
+    disk_gb: 10,
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  output {
+    File merged_counts = "${outfile}"
+  }
+
+  command <<<
+
+    set -euo pipefail
+    head -n1 ~{svcounts[0]} > ~{outfile}
+    for file in ~{sep=" " svcounts}; do
+      cat "$file" | sed '1d' >> ~{outfile}
+    done
+  
+  >>>
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: linux_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+
+}
 
 # Merge outlier sample lists across algorithms
 task CatOutliers {