From aff60eef2febea8eb7f48538ca1b5548c9445135 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Mon, 9 Jan 2023 16:29:31 -0500 Subject: [PATCH 1/7] Add BCFTools preprocessing option to PlotSVCountsPerSample --- wdl/PlotSVCountsPerSample.wdl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/wdl/PlotSVCountsPerSample.wdl b/wdl/PlotSVCountsPerSample.wdl index 65602920f..0c9edab16 100644 --- a/wdl/PlotSVCountsPerSample.wdl +++ b/wdl/PlotSVCountsPerSample.wdl @@ -1,6 +1,7 @@ version 1.0 import "Structs.wdl" +import "CollectQcVcfWide.wdl" as qc_utils # Workflow to count SVs per sample per type and workflow PlotSVCountsPerSample { @@ -8,7 +9,9 @@ workflow PlotSVCountsPerSample { String prefix Array[File?] vcfs Int N_IQR_cutoff + String? bcftools_preprocessing_options String sv_pipeline_docker + RuntimeAttr? runtime_override_preprocess_vcf RuntimeAttr? runtime_attr_count_svs RuntimeAttr? runtime_attr_plot_svcounts RuntimeAttr? runtime_attr_cat_outliers_preview @@ -16,10 +19,23 @@ workflow PlotSVCountsPerSample { scatter (vcf in vcfs) { if (defined(vcf)) { - String vcf_name = basename(select_first([vcf]), ".vcf.gz") + # Preprocess VCF with bcftools, if optioned + if (defined(bcftools_preprocessing_options)) { + call qc_utils.PreprocessVcf { + input: + vcf=select_first([vcf]), + prefix=basename(select_first([vcf]), ".vcf.gz") + ".preprocessed", + bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]), + sv_base_mini_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_preprocess_vcf + } + } + File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf]) + + String vcf_name = basename(select_first([prepped_vcf]), ".vcf.gz") call CountSVsPerSamplePerType { input: - vcf = select_first([vcf]), + vcf = select_first([prepped_vcf]), prefix = vcf_name, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_count_svs From da042c6f4410acc5e4a95768d6d762f8c205b829 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Mon, 9 Jan 2023 16:55:47 -0500 Subject: [PATCH 2/7] Add BCFTools preprocessing option to outlier sample exclusion --- wdl/FilterOutlierSamples.wdl | 4 ++++ wdl/IdentifyOutlierSamples.wdl | 22 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index df423292e..2b426b196 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -13,9 +13,11 @@ workflow FilterOutlierSamples { Int N_IQR_cutoff File? outlier_cutoff_table String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes + String? bcftools_preprocessing_options String sv_pipeline_docker String sv_base_mini_docker String linux_docker + RuntimeAttr? runtime_override_preprocess_vcf RuntimeAttr? runtime_attr_identify_outliers RuntimeAttr? runtime_attr_subset_vcf RuntimeAttr? runtime_attr_cat_outliers @@ -32,8 +34,10 @@ workflow FilterOutlierSamples { N_IQR_cutoff = N_IQR_cutoff, outlier_cutoff_table = outlier_cutoff_table, vcf_identifier = vcf_identifier, + bcftools_preprocessing_options = bcftools_preprocessing_options, sv_pipeline_docker = sv_pipeline_docker, linux_docker = linux_docker, + runtime_override_preprocess_vcf = runtime_override_preprocess_vcf, runtime_attr_identify_outliers = runtime_attr_identify_outliers, runtime_attr_cat_outliers = runtime_attr_cat_outliers, runtime_attr_count_svs = runtime_attr_count_svs diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index 462c79746..afbd6d24b 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -2,6 +2,7 @@ version 1.0 import "Structs.wdl" import "PlotSVCountsPerSample.wdl" as plot_svcounts +import "CollectQcVcfWide.wdl" as qc_utils workflow IdentifyOutlierSamples { input { @@ -11,8 +12,10 @@ workflow IdentifyOutlierSamples { Int N_IQR_cutoff File? outlier_cutoff_table String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, optional otherwise to add to file prefixes (ie. as a VCF identifier) + String? bcftools_preprocessing_options String sv_pipeline_docker String linux_docker + RuntimeAttr? runtime_override_preprocess_vcf RuntimeAttr? runtime_attr_identify_outliers RuntimeAttr? runtime_attr_cat_outliers RuntimeAttr? runtime_attr_count_svs @@ -20,10 +23,23 @@ workflow IdentifyOutlierSamples { String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name + # Preprocess VCF with bcftools, if optioned + if (defined(bcftools_preprocessing_options)) { + call qc_utils.PreprocessVcf { + input: + vcf=vcf, + prefix=basename(vcf, ".vcf.gz") + ".preprocessed", + bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]), + sv_base_mini_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_preprocess_vcf + } + } + File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf]) + if (!defined(sv_counts)) { call plot_svcounts.CountSVsPerSamplePerType { input: - vcf = vcf, + vcf = prepped_vcf, prefix = prefix, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_count_svs @@ -33,7 +49,7 @@ workflow IdentifyOutlierSamples { if (defined(outlier_cutoff_table)) { call IdentifyOutliersByCutoffTable { input: - vcf = vcf, + vcf = prepped_vcf, svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]), outlier_cutoff_table = select_first([outlier_cutoff_table]), outfile = "${prefix}_outliers.txt", @@ -44,7 +60,7 @@ workflow IdentifyOutlierSamples { } call IdentifyOutliersByIQR { input: - vcf = vcf, + vcf = prepped_vcf, svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]), N_IQR_cutoff = N_IQR_cutoff, outfile = "${prefix}_IQR_outliers.txt", From 9520b8cde07937d73b99b27fa8479fd4cb13003e Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Mon, 9 Jan 2023 17:23:01 -0500 Subject: [PATCH 3/7] Output subsetted VCF index from outlier sample exclusion --- wdl/FilterOutlierSamples.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 2b426b196..1ee0fb8a9 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -72,6 +72,7 @@ workflow FilterOutlierSamples { output { File outlier_filtered_vcf = SubsetVcfBySamplesList.vcf_subset + File outlier_filtered_vcf_idx = SubsetVcfBySamplesList.vcf_subset_index Array[String] filtered_samples_list = FilterSampleList.filtered_samples_list File filtered_samples_file = FilterSampleList.filtered_samples_file Array[String] outlier_samples_excluded = IdentifyOutlierSamples.outlier_samples_list From 0f87632b388d51a865af5cc3af10e6515bd528d8 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Tue, 7 Feb 2023 17:01:11 -0500 Subject: [PATCH 4/7] Extend outlier exclusion workflow to accept multiple VCFs --- wdl/FilterOutlierSamples.wdl | 31 +++++++++-------- wdl/IdentifyOutlierSamples.wdl | 61 +++++++++++++++++++++------------- wdl/PlotSVCountsPerSample.wdl | 2 +- 3 files changed, 55 insertions(+), 39 deletions(-) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 1ee0fb8a9..80d38b6cf 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -4,11 +4,12 @@ import "Structs.wdl" import "Utils.wdl" as util import "IdentifyOutlierSamples.wdl" as identify_outliers -# Filter outlier samples by IQR or cutoff table for a single VCF. Recommended to run PlotSVCountsPerSample first to choose cutoff +# Filter outlier samples by IQR or cutoff table for one or more VCFs +# Recommended to run PlotSVCountsPerSample first to choose cutoff workflow FilterOutlierSamples { input { String name # batch or cohort - File vcf + Array[File] vcfs File? sv_counts # SV counts file from PlotSVCountsPerSample - if not provided, will create Int N_IQR_cutoff File? outlier_cutoff_table @@ -28,7 +29,7 @@ workflow FilterOutlierSamples { call identify_outliers.IdentifyOutlierSamples { input: - vcf = vcf, + vcfs = vcfs, name = name, sv_counts = sv_counts, N_IQR_cutoff = N_IQR_cutoff, @@ -43,19 +44,21 @@ workflow FilterOutlierSamples { runtime_attr_count_svs = runtime_attr_count_svs } - call util.SubsetVcfBySamplesList { - input: - vcf = vcf, - list_of_samples = IdentifyOutlierSamples.outlier_samples_file, - outfile_name = "${name}.outliers_removed.vcf.gz", - remove_samples = true, - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_subset_vcf + scatter ( vcf in vcfs ) { + call util.SubsetVcfBySamplesList { + input: + vcf = vcf, + list_of_samples = IdentifyOutlierSamples.outlier_samples_file, + outfile_name = basename(vcf, ".vcf.gz") + ".${name}.outliers_removed.vcf.gz", + remove_samples = true, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_subset_vcf + } } call util.GetSampleIdsFromVcf { input: - vcf = vcf, + vcf = vcfs[0], sv_base_mini_docker = sv_base_mini_docker, runtime_attr_override = runtime_attr_ids_from_vcf } @@ -71,8 +74,8 @@ workflow FilterOutlierSamples { } output { - File outlier_filtered_vcf = SubsetVcfBySamplesList.vcf_subset - File outlier_filtered_vcf_idx = SubsetVcfBySamplesList.vcf_subset_index + Array[File] outlier_filtered_vcfs = SubsetVcfBySamplesList.vcf_subset + Array[File] outlier_filtered_vcf_idxs = SubsetVcfBySamplesList.vcf_subset_index Array[String] filtered_samples_list = FilterSampleList.filtered_samples_list File filtered_samples_file = FilterSampleList.filtered_samples_file Array[String] outlier_samples_excluded = IdentifyOutlierSamples.outlier_samples_list diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index afbd6d24b..5a002cff3 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -3,11 +3,14 @@ version 1.0 import "Structs.wdl" import "PlotSVCountsPerSample.wdl" as plot_svcounts import "CollectQcVcfWide.wdl" as qc_utils +import "FilterOutlierSamplesPostMinGQ.wdl" as legacy + +# Identifies a list of SV count outlier samples from one or more input VCFs workflow IdentifyOutlierSamples { input { String name # batch or cohort - File vcf + Array[File] vcfs File? sv_counts # SV counts file from PlotSVCountsPerSample - if not provided, will create Int N_IQR_cutoff File? outlier_cutoff_table @@ -23,34 +26,47 @@ workflow IdentifyOutlierSamples { String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name - # Preprocess VCF with bcftools, if optioned - if (defined(bcftools_preprocessing_options)) { - call qc_utils.PreprocessVcf { - input: - vcf=vcf, - prefix=basename(vcf, ".vcf.gz") + ".preprocessed", - bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]), - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_preprocess_vcf + # Collect SV counts for each VCF in parallel inless sv_counts is provided + if (!defined(sv_counts)) { + + # Process each VCF in parallel + scatter ( vcf in vcfs ) { + # Preprocess VCF with bcftools, if optioned + if (defined(bcftools_preprocessing_options)) { + call qc_utils.PreprocessVcf { + input: + vcf = vcf, + prefix = basename(vcf, ".vcf.gz") + ".preprocessed", + bcftools_preprocessing_options = select_first([bcftools_preprocessing_options]), + sv_base_mini_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_preprocess_vcf + } + } + File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf]) + + call plot_svcounts.CountSVsPerSamplePerType { + input: + vcf = prepped_vcf, + prefix = prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_count_svs + } } - } - File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf]) - if (!defined(sv_counts)) { - call plot_svcounts.CountSVsPerSamplePerType { + # Combine counts across all VCFs + call legacy.CombineCounts as Combine { input: - vcf = prepped_vcf, + svcounts = CountSVsPerSamplePerType.sv_counts, prefix = prefix, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_count_svs + sv_pipeline_docker = sv_pipeline_docker } } + File final_counts = select_first([sv_counts, Combine.summed_svcounts]) if (defined(outlier_cutoff_table)) { call IdentifyOutliersByCutoffTable { input: - vcf = prepped_vcf, - svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]), + svcounts = final_counts, outlier_cutoff_table = select_first([outlier_cutoff_table]), outfile = "${prefix}_outliers.txt", algorithm = select_first([vcf_identifier]), @@ -60,8 +76,7 @@ workflow IdentifyOutlierSamples { } call IdentifyOutliersByIQR { input: - vcf = prepped_vcf, - svcounts = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]), + svcounts = final_counts, N_IQR_cutoff = N_IQR_cutoff, outfile = "${prefix}_IQR_outliers.txt", sv_pipeline_docker = sv_pipeline_docker, @@ -80,13 +95,12 @@ workflow IdentifyOutlierSamples { output { File outlier_samples_file = CatOutliers.outliers_file Array[String] outlier_samples_list = CatOutliers.outliers_list - File sv_counts_file = select_first([sv_counts, CountSVsPerSamplePerType.sv_counts]) + File sv_counts_file = final_counts } } task IdentifyOutliersByIQR { input { - File vcf File svcounts Int N_IQR_cutoff String outfile @@ -132,7 +146,6 @@ task IdentifyOutliersByIQR { task IdentifyOutliersByCutoffTable { input { - File vcf File svcounts File outlier_cutoff_table String outfile diff --git a/wdl/PlotSVCountsPerSample.wdl b/wdl/PlotSVCountsPerSample.wdl index 0c9edab16..180ae0a7c 100644 --- a/wdl/PlotSVCountsPerSample.wdl +++ b/wdl/PlotSVCountsPerSample.wdl @@ -3,7 +3,7 @@ version 1.0 import "Structs.wdl" import "CollectQcVcfWide.wdl" as qc_utils -# Workflow to count SVs per sample per type and +# Workflow to count SVs per sample per type workflow PlotSVCountsPerSample { input { String prefix From 6442927c2c03f750bcec4b8aa39b29fcd99618c0 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Wed, 8 Feb 2023 11:20:05 -0500 Subject: [PATCH 5/7] Extend outlier exclusion workflow, v1 --- .../sum_svcounts_perSample.R | 19 +-- wdl/FilterOutlierSamples.wdl | 12 +- wdl/IdentifyOutlierSamples.wdl | 119 ++++++++++++++++-- 3 files changed, 130 insertions(+), 20 deletions(-) diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R index dfe02f91a..e49b8f129 100755 --- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R +++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R @@ -4,26 +4,29 @@ # in final_outlier_sample_filter.wdl -###Set master parameters & read arguments -options(stringsAsFactors=F,scipen=1000) +###Set parameters & read arguments +options(stringsAsFactors=F, scipen=1000) args <- commandArgs(trailingOnly=TRUE) INFILE <- args[1] OUTFILE <- args[2] ###Read input data & reformat -dat <- read.table(INFILE,header=F) -colnames(dat) <- c("sample","svtype","count","chrom") +dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="") +if(dat[1, 1] %in% c("sample", "#sample")){ + dat <- dat[-c(1), ] +} +colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)] samples <- as.character(unique(dat$sample)) svtypes <- as.character(unique(dat$svtype)) ###Get sum of counts per sample per svtype -summed.res <- do.call("rbind", lapply(samples,function(sample){ - return(do.call("rbind", lapply(svtypes,function(svtype){ +summed.res <- do.call("rbind", lapply(samples, function(sample){ + return(do.call("rbind", lapply(svtypes, function(svtype){ return(data.frame("sample"=sample, "svtype"=svtype, - "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype),]$count,na.rm=T))) + "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T))) }))) })) ###Write summed results to outfile -write.table(summed.res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) +write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 80d38b6cf..3e0fcfa68 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -15,6 +15,8 @@ workflow FilterOutlierSamples { File? outlier_cutoff_table String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes String? bcftools_preprocessing_options + Boolean plot_counts = false + Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs String sv_pipeline_docker String sv_base_mini_docker String linux_docker @@ -22,9 +24,11 @@ workflow FilterOutlierSamples { RuntimeAttr? runtime_attr_identify_outliers RuntimeAttr? runtime_attr_subset_vcf RuntimeAttr? runtime_attr_cat_outliers + RuntimeAttr? runtime_attr_subset_counts RuntimeAttr? runtime_attr_filter_samples RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_plot_svcounts } call identify_outliers.IdentifyOutlierSamples { @@ -36,12 +40,18 @@ workflow FilterOutlierSamples { outlier_cutoff_table = outlier_cutoff_table, vcf_identifier = vcf_identifier, bcftools_preprocessing_options = bcftools_preprocessing_options, + plot_counts = plot_counts, + sample_subsets = sample_subsets, sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, + runtime_attr_ids_from_vcf = runtime_attr_ids_from_vcf, runtime_override_preprocess_vcf = runtime_override_preprocess_vcf, runtime_attr_identify_outliers = runtime_attr_identify_outliers, runtime_attr_cat_outliers = runtime_attr_cat_outliers, - runtime_attr_count_svs = runtime_attr_count_svs + runtime_attr_subset_counts = runtime_attr_subset_counts, + runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_plot_svcounts = runtime_attr_plot_svcounts } scatter ( vcf in vcfs ) { diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index 5a002cff3..87beb8bdd 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -2,6 +2,7 @@ version 1.0 import "Structs.wdl" import "PlotSVCountsPerSample.wdl" as plot_svcounts +import "Utils.wdl" as util import "CollectQcVcfWide.wdl" as qc_utils import "FilterOutlierSamplesPostMinGQ.wdl" as legacy @@ -16,17 +17,33 @@ workflow IdentifyOutlierSamples { File? outlier_cutoff_table String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, optional otherwise to add to file prefixes (ie. as a VCF identifier) String? bcftools_preprocessing_options + Boolean plot_counts = false + Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs String sv_pipeline_docker + String sv_base_mini_docker String linux_docker + RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_override_preprocess_vcf RuntimeAttr? runtime_attr_identify_outliers RuntimeAttr? runtime_attr_cat_outliers + RuntimeAttr? runtime_attr_subset_counts RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_plot_svcounts } String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name - # Collect SV counts for each VCF in parallel inless sv_counts is provided + if (!defined(sample_subsets)) { + call util.GetSampleIdsFromVcf as GetSamplesList { + input: + vcf = vcfs[0], + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_ids_from_vcf + } + } + Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]]) + + # Collect SV counts for each VCF in parallel unless sv_counts is provided if (!defined(sv_counts)) { # Process each VCF in parallel @@ -44,7 +61,7 @@ workflow IdentifyOutlierSamples { } File prepped_vcf = select_first([PreprocessVcf.outvcf, vcf]) - call plot_svcounts.CountSVsPerSamplePerType { + call plot_svcounts.CountSVsPerSamplePerType as CountPerVcf { input: vcf = prepped_vcf, prefix = prefix, @@ -56,13 +73,14 @@ workflow IdentifyOutlierSamples { # Combine counts across all VCFs call legacy.CombineCounts as Combine { input: - svcounts = CountSVsPerSamplePerType.sv_counts, + svcounts = CountPerVcf.sv_counts, prefix = prefix, sv_pipeline_docker = sv_pipeline_docker } } File final_counts = select_first([sv_counts, Combine.summed_svcounts]) + # If a precomputed outlier table is provided, directly apply those cutoffs if (defined(outlier_cutoff_table)) { call IdentifyOutliersByCutoffTable { input: @@ -74,19 +92,53 @@ workflow IdentifyOutlierSamples { runtime_attr_override = runtime_attr_identify_outliers } } - call IdentifyOutliersByIQR { - input: - svcounts = final_counts, - N_IQR_cutoff = N_IQR_cutoff, - outfile = "${prefix}_IQR_outliers.txt", - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_identify_outliers + # Otherwise, dynamically determine outliers based on N_IQR + if (!defined(outlier_cutoff_table)) { + + # Determine outliers for each sample subset, if optioned (e.g., PCR+ vs. PCR-) + scatter ( subset_info in subsets_to_eval ) { + call SubsetCounts { + input: + svcounts = final_counts, + samples_list = subset_info.right, + outfile = "${prefix}.${subset_info.left}.counts.tsv", + linux_docker = linux_docker, + runtime_attr_override = runtime_attr_subset_counts + } + call IdentifyOutliersByIQR { + input: + svcounts = SubsetCounts.counts_subset, + N_IQR_cutoff = N_IQR_cutoff, + outfile = "${prefix}.${subset_info.left}.IQR_outliers.txt", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_identify_outliers + } + if (plot_counts) { + call plot_svcounts.PlotSVCountsWithCutoff as PlotCountsPerSubset { + input: + svcounts = SubsetCounts.counts_subset, + n_iqr_cutoff = N_IQR_cutoff, + prefix = "${prefix}.${subset_info.left}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_plot_svcounts + } + } + } + + # Merge outlier lists across all subsets + call CatOutliers as CatSubsets { + input: + outliers = select_all(IdentifyOutliersByIQR.outliers_list), + batch = prefix, + linux_docker = linux_docker, + runtime_attr_override = runtime_attr_cat_outliers + } } # Merge list of outliers call CatOutliers { input: - outliers = select_all([IdentifyOutliersByIQR.outliers_list,IdentifyOutliersByCutoffTable.outliers_list]), + outliers = select_all([CatSubsets.outliers_file, IdentifyOutliersByCutoffTable.outliers_list]), batch = prefix, linux_docker = linux_docker, runtime_attr_override = runtime_attr_cat_outliers @@ -96,6 +148,7 @@ workflow IdentifyOutlierSamples { File outlier_samples_file = CatOutliers.outliers_file Array[String] outlier_samples_list = CatOutliers.outliers_list File sv_counts_file = final_counts + Array[File?]? outlier_plots = PlotCountsPerSubset.svcount_distrib_plots } } @@ -191,6 +244,50 @@ task IdentifyOutliersByCutoffTable { } +# Restrict a file of SV counts per sample to a subset of samples +task SubsetCounts { + input { + File svcounts + File samples_list + String outfile + String linux_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File counts_subset = "${outfile}" + } + + command <<< + + set -euo pipefail + head -n1 ~{svcounts} > ~{outfile} + fgrep -wf ~{samples_list} ~{svcounts} >> ~{outfile} + + >>> + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: linux_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } + +} + # Merge outlier sample lists across algorithms task CatOutliers { From c9b332fa6a37f53f5a01782ec8a1e8a1e16369d0 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Thu, 9 Feb 2023 11:04:59 -0500 Subject: [PATCH 6/7] Fix header issues when combining SV counts from multiple VCFs --- inputs/values/dockers.json | 2 +- .../sum_svcounts_perSample.R | 23 +++++++++++-------- wdl/FilterOutlierSamples.wdl | 10 +++++++- wdl/IdentifyOutlierSamples.wdl | 6 +++-- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 8448074b6..8d20e0d3e 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -33,4 +33,4 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f", "str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670" -} \ No newline at end of file +} diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R index e49b8f129..8989a6b2d 100755 --- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R +++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R @@ -4,29 +4,32 @@ # in final_outlier_sample_filter.wdl -###Set parameters & read arguments +### Set parameters & read arguments options(stringsAsFactors=F, scipen=1000) args <- commandArgs(trailingOnly=TRUE) INFILE <- args[1] OUTFILE <- args[2] -###Read input data & reformat +### Read input data & reformat dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="") -if(dat[1, 1] %in% c("sample", "#sample")){ - dat <- dat[-c(1), ] +drop.rows <- which(dat[, 1] %in% c("sample", "#sample")) +if(length(drop.rows) > 0){ + dat <- dat[-drop.rows, ] } colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)] +dat$count <- as.numeric(dat$count) samples <- as.character(unique(dat$sample)) svtypes <- as.character(unique(dat$svtype)) -###Get sum of counts per sample per svtype +### Get sum of counts per sample per svtype summed.res <- do.call("rbind", lapply(samples, function(sample){ - return(do.call("rbind", lapply(svtypes, function(svtype){ - return(data.frame("sample"=sample, + do.call("rbind", lapply(svtypes, function(svtype){ + data.frame("sample"=sample, "svtype"=svtype, - "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T))) - }))) + "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count, + na.rm=T)) + })) })) -###Write summed results to outfile +### Write summed results to outfile write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 3e0fcfa68..93be211e6 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -16,7 +16,8 @@ workflow FilterOutlierSamples { String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes String? bcftools_preprocessing_options Boolean plot_counts = false - Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs + Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset + Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset String sv_pipeline_docker String sv_base_mini_docker String linux_docker @@ -28,9 +29,15 @@ workflow FilterOutlierSamples { RuntimeAttr? runtime_attr_filter_samples RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_combine_counts RuntimeAttr? runtime_attr_plot_svcounts } + if (defined(sample_subset_prefixes) && defined(sample_subset_lists)) { + Array[Pair[String, File]]? sample_subsets = zip(select_first([sample_subset_prefixes]), + select_first([sample_subset_lists])) + } + call identify_outliers.IdentifyOutlierSamples { input: vcfs = vcfs, @@ -51,6 +58,7 @@ workflow FilterOutlierSamples { runtime_attr_cat_outliers = runtime_attr_cat_outliers, runtime_attr_subset_counts = runtime_attr_subset_counts, runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_combine_counts = runtime_attr_combine_counts, runtime_attr_plot_svcounts = runtime_attr_plot_svcounts } diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index 87beb8bdd..83cbe10d5 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -28,6 +28,7 @@ workflow IdentifyOutlierSamples { RuntimeAttr? runtime_attr_cat_outliers RuntimeAttr? runtime_attr_subset_counts RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_combine_counts RuntimeAttr? runtime_attr_plot_svcounts } @@ -41,7 +42,7 @@ workflow IdentifyOutlierSamples { runtime_attr_override = runtime_attr_ids_from_vcf } } - Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]]) + Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]]) # Collect SV counts for each VCF in parallel unless sv_counts is provided if (!defined(sv_counts)) { @@ -75,7 +76,8 @@ workflow IdentifyOutlierSamples { input: svcounts = CountPerVcf.sv_counts, prefix = prefix, - sv_pipeline_docker = sv_pipeline_docker + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_combine_counts } } File final_counts = select_first([sv_counts, Combine.summed_svcounts]) From cdca40801c8ef625983db442b5b788b11b05851d Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Sat, 11 Feb 2023 07:54:32 -0500 Subject: [PATCH 7/7] Final update before PR --- wdl/FilterOutlierSamples.wdl | 2 + wdl/IdentifyOutlierSamples.wdl | 110 +++++++++++++++++++++++++++------ 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 93be211e6..3c54fce5d 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -18,6 +18,7 @@ workflow FilterOutlierSamples { Boolean plot_counts = false Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset + Int samples_per_shard = 5000 String sv_pipeline_docker String sv_base_mini_docker String linux_docker @@ -49,6 +50,7 @@ workflow FilterOutlierSamples { bcftools_preprocessing_options = bcftools_preprocessing_options, plot_counts = plot_counts, sample_subsets = sample_subsets, + samples_per_shard = samples_per_shard, sv_pipeline_docker = sv_pipeline_docker, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index 83cbe10d5..273670023 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -3,6 +3,7 @@ version 1.0 import "Structs.wdl" import "PlotSVCountsPerSample.wdl" as plot_svcounts import "Utils.wdl" as util +import "TasksMakeCohortVcf.wdl" as cohort_utils import "CollectQcVcfWide.wdl" as qc_utils import "FilterOutlierSamplesPostMinGQ.wdl" as legacy @@ -19,6 +20,7 @@ workflow IdentifyOutlierSamples { String? bcftools_preprocessing_options Boolean plot_counts = false Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs + Int samples_per_shard = 5000 String sv_pipeline_docker String sv_base_mini_docker String linux_docker @@ -34,15 +36,14 @@ workflow IdentifyOutlierSamples { String prefix = if (defined(vcf_identifier)) then "~{name}_~{vcf_identifier}" else name - if (!defined(sample_subsets)) { - call util.GetSampleIdsFromVcf as GetSamplesList { - input: - vcf = vcfs[0], - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_ids_from_vcf - } + call util.GetSampleIdsFromVcf as GetSamplesList { + input: + vcf = vcfs[0], + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_ids_from_vcf } - Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]]) + Array[Pair[String, File]] default_subsets = [("ALL", GetSamplesList.out_file)] + Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, default_subsets]) # Collect SV counts for each VCF in parallel unless sv_counts is provided if (!defined(sv_counts)) { @@ -71,16 +72,41 @@ workflow IdentifyOutlierSamples { } } - # Combine counts across all VCFs - call legacy.CombineCounts as Combine { + # Combine counts across all VCFs (scattered over sample chunks) + call cohort_utils.SplitUncompressed as ShardSamples { input: - svcounts = CountPerVcf.sv_counts, - prefix = prefix, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_combine_counts + whole_file = GetSamplesList.out_file, + lines_per_shard = samples_per_shard, + shard_prefix = prefix, + shuffle_file = true, + random_seed = 2023, + sv_pipeline_docker = sv_pipeline_docker + } + scatter ( sample_shard in ShardSamples.shards ) { + call SubsetCounts as SubsetPreCombine { + input: + svcounts = CountPerVcf.sv_counts, + samples_list = sample_shard, + outfile = "${prefix}.shard.counts.tsv", + linux_docker = linux_docker, + runtime_attr_override = runtime_attr_subset_counts + } + call legacy.CombineCounts as CombineShard { + input: + svcounts = [SubsetPreCombine.counts_subset], + prefix = prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_combine_counts + } + } + call CatCounts as Combine { + input: + svcounts = CombineShard.summed_svcounts, + outfile = "${prefix}.merged.counts.tsv", + linux_docker = linux_docker } } - File final_counts = select_first([sv_counts, Combine.summed_svcounts]) + File final_counts = select_first([sv_counts, Combine.merged_counts]) # If a precomputed outlier table is provided, directly apply those cutoffs if (defined(outlier_cutoff_table)) { @@ -101,7 +127,7 @@ workflow IdentifyOutlierSamples { scatter ( subset_info in subsets_to_eval ) { call SubsetCounts { input: - svcounts = final_counts, + svcounts = [final_counts], samples_list = subset_info.right, outfile = "${prefix}.${subset_info.left}.counts.tsv", linux_docker = linux_docker, @@ -249,7 +275,7 @@ task IdentifyOutliersByCutoffTable { # Restrict a file of SV counts per sample to a subset of samples task SubsetCounts { input { - File svcounts + Array[File] svcounts File samples_list String outfile String linux_docker @@ -273,8 +299,10 @@ task SubsetCounts { command <<< set -euo pipefail - head -n1 ~{svcounts} > ~{outfile} - fgrep -wf ~{samples_list} ~{svcounts} >> ~{outfile} + head -n1 ~{svcounts[0]} > ~{outfile} + for file in ~{sep=" " svcounts}; do + fgrep -wf ~{samples_list} "$file" >> ~{outfile} + done >>> @@ -290,6 +318,50 @@ task SubsetCounts { } +# Naive concatenation of multiple counts files while accounting for header +task CatCounts { + input { + Array[File] svcounts + String outfile + String linux_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File merged_counts = "${outfile}" + } + + command <<< + + set -euo pipefail + head -n1 ~{svcounts[0]} > ~{outfile} + for file in ~{sep=" " svcounts}; do + cat "$file" | sed '1d' >> ~{outfile} + done + + >>> + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: linux_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } + +} # Merge outlier sample lists across algorithms task CatOutliers {