4 changes: 4 additions & 0 deletions CITATIONS.md
@@ -8,6 +8,10 @@

> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.

## [LRSomatic](https://doi.org/10.64898/2026.02.26.707772)

> LRSomatic: a highly scalable and robust pipeline for somatic variant calling in long-read sequencing data. Robert A. Forsyth*, Luuk Harbers*, Amber Verhasselt, Ana-Lucía Rocha Iraizós, Sidi Yang, Joris Vande Velde, Christopher Davies, Nischalan Pillay, Laurens Lambrechts, Jonas Demeulemeester. bioRxiv 2026.02.26.707772; doi: https://doi.org/10.64898/2026.02.26.707772

## Pipeline tools

- [ASCAT](https://pubmed.ncbi.nlm.nih.gov/20837533/)
8 changes: 7 additions & 1 deletion README.md
@@ -162,7 +162,13 @@ If you would like to contribute to this pipeline, please see the [contributing g

## Citations

If you use IntGenomicsLab/lrsomatic for your analysis, please cite it using the following doi: [10.5281/zenodo.17751829](https://doi.org/10.5281/zenodo.17751829)
If you use `IntGenomicsLab/lrsomatic` for your analysis, please cite it using the following:

> LRSomatic: a highly scalable and robust pipeline for somatic variant calling in long-read sequencing data
>
> Robert A. Forsyth*, Luuk Harbers*, Amber Verhasselt, Ana-Lucía Rocha Iraizós, Sidi Yang, Joris Vande Velde, Christopher Davies, Nischalan Pillay, Laurens Lambrechts, Jonas Demeulemeester
>
> bioRxiv 2026.02.26.707772; doi: https://doi.org/10.64898/2026.02.26.707772

An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.

28 changes: 28 additions & 0 deletions docs/usage.md
@@ -97,6 +97,34 @@ genome: 'GRCh37'

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

## CHM13 Support

The pipeline fully supports CHM13: most reference and annotation files are downloaded automatically when `--genome CHM13` is specified.

VEP, however, requires some additional setup: the VEP cache for CHM13 must be downloaded manually, as shown below. Feel free to change any of the paths, as long as the corresponding pipeline parameters point to the same locations.

Download CHM13 Cache:

```bash
cd $HOME/.vep
curl -O https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/variation/2022_10/indexed_vep_cache/Homo_sapiens-GCA_009914755.4-2022_10.tar.gz
tar xzf Homo_sapiens-GCA_009914755.4-2022_10.tar.gz
```

Then you can run the pipeline as follows:

```bash
nextflow run IntGenomicsLab/lrsomatic \
--input samplesheet.csv \
--outdir ./results \
--genome CHM13 \
--vep_cache $HOME/.vep \
--vep_cache_version 107 \
-profile docker
```

If you want to run with a CHM13 reference without using `--genome CHM13` (for example, via a custom FASTA or configuration), you must also specify `--vep_genome T2T-CHM13v2.0` and `--vep_species homo_sapiens_gca009914755v4`.
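For example, a run against a custom CHM13 FASTA might look like the following. This is an illustrative sketch: the `--fasta` parameter name and the file path are assumptions, so check the pipeline's parameter documentation for the exact flag your version expects.

```bash
# Hypothetical custom-reference run: --fasta and the path below are placeholders
nextflow run IntGenomicsLab/lrsomatic \
    --input samplesheet.csv \
    --outdir ./results \
    --fasta /path/to/chm13v2.0.fa \
    --vep_cache $HOME/.vep \
    --vep_cache_version 107 \
    --vep_genome T2T-CHM13v2.0 \
    --vep_species homo_sapiens_gca009914755v4 \
    -profile docker
```

Without `--vep_genome` and `--vep_species`, VEP would fall back to the defaults for the selected genome and fail to locate the CHM13 cache.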

### Pipeline options

| Parameter | Description |
12 changes: 9 additions & 3 deletions subworkflows/local/prepare_reference_files.nf
@@ -38,6 +38,7 @@ workflow PREPARE_REFERENCE_FILES {
} else {
ch_prepared_fasta = [ [:], fasta ]
}
// ch_prepared_fasta: [[:], fasta_path] -- empty meta; uncompressed if input was .gz

// if clair3 model is specified, then download that
// otherwise use info in bam header and download that
@@ -52,6 +53,7 @@
}
.unique()
.set{ clair3_model_urls }
// [meta(id=clair3_model_id), download_url] -- one item per unique Clair3 model; deduplicated with .unique()

//
// MODULE: Download model
@@ -70,6 +72,7 @@
)

UNTAR.out.untar.set { downloaded_clair3_models }
// [meta(id=clair3_model_id), model_dir] -- extracted Clair3 model directory

//
// MODULE: Index the fasta
@@ -80,6 +83,7 @@
)

ch_prepared_fai = SAMTOOLS_FAIDX.out.fai
// ch_prepared_fai: [[:], fai_path] -- empty meta

//
// Prepare ASCAT files
@@ -117,14 +121,16 @@
}

emit:
prepped_fasta = ch_prepared_fasta
prepped_fai = ch_prepared_fai
prepped_fasta = ch_prepared_fasta // [[:], fasta_path]
prepped_fai = ch_prepared_fai // [[:], fai_path]

// ASCAT reference files -- flat file collections (no meta tuple wrapper)
allele_files
loci_files
gc_file
rt_file
downloaded_clair3_models

downloaded_clair3_models // [meta(id=clair3_model_id), model_dir]

versions = ch_versions
}
60 changes: 13 additions & 47 deletions subworkflows/local/tumor_normal_happhase.nf
@@ -52,6 +52,7 @@ workflow TUMOR_NORMAL_HAPPHASE {
return [ new_meta, meta.clair3_model, bam, bai ]
}
.set { normal_bams_model }
// [meta, clair3_model_id, bam, bai] -- keyed by model ID for .combine() with downloaded_clair3_models

normal_bams_model
.combine(downloaded_clair3_models,by:1)
@@ -60,6 +61,7 @@
return [meta_bam, bam, bai, model, platform]
}
.set{ normal_bams }
// [meta, bam, bai, clair3_model_dir, platform] -- type excluded from meta; platform is "hifi" for PacBio

/*
.map{ basecall_model, meta, bam, bai, meta2, model ->
Expand All @@ -68,13 +70,6 @@ workflow TUMOR_NORMAL_HAPPHASE {
}
*/

// normal_bams -> meta: [id, paired_data, platform, sex, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// clair3_model: clair3 model name
// platform: name of sequencing platform


// Get tumour bams
// remove type from so that information can be merged easier later
mixed_bams.tumor
@@ -91,10 +86,7 @@
return[new_meta, bam, bai]
}
.set{ tumor_bams }

// tumor_bams -> meta: [id, paired_data, platform, sex, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// [meta, bam, bai] -- type excluded from meta for downstream groupTuple merge

//
// MODULE: CLAIR3
@@ -117,20 +109,15 @@
return [meta, bam, bai, vcf, svs, mods]
}
.set{ normal_bams_germlinevcf }

// normal_bams -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// vcf: normal small germline variant vcf
// svs: structural variant vcf (empty)
// mods: modcall-generated VCF with modifications (empty)
// [meta, bam, bai, germline_vcf, [], []] -- svs and mods are empty placeholders for LONGPHASE_PHASE input

CLAIR3.out.vcf
.map { meta, vcf ->
def extra = []
return [meta, vcf, extra]
}
.set { germline_vep }
// [meta, clair3_vcf, []] -- germline small variants for VEP annotation

//
// MODULE: LONGPHASE_PHASE
@@ -158,14 +145,7 @@
return[new_meta, bam, bai, vcf, svs, mods]
}
.set{ normal_bams }

// normal_bams -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// vcf: normal small germline variant vcf
// svs: structural variant vcf (empty)
// mods: modcall-generated VCF with modifications (empty)

// [meta+{type:"normal"}, bam, bai, phased_vcf, [], []] -- type re-added; svs and mods are empty placeholders for LONGPHASE_HAPLOTAG

// Add phased vcf to tumour bams and type information
// mix with the normal bams
Expand All @@ -180,13 +160,7 @@ workflow TUMOR_NORMAL_HAPPHASE {
}
.mix(normal_bams)
.set{ mixed_bams_vcf }

// mixed_bams_vcf -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// vcf: normal small germline variant vcf
// svs: structural variant vcf (empty)
// mods: modcall-generated VCF with modifications (empty)
// [meta+{type}, bam, bai, phased_normal_vcf, [], []] -- tumor and normal items both carry the same phased normal VCF

//
// MODULE: LONGPHASE_HAPLOTAG
@@ -204,10 +178,7 @@
// Get final tagged bams
LONGPHASE_HAPLOTAG.out.bam
.set{ mixed_hapbams }

// mixed_hapbams -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bams: haplotagged aligned bams

// [meta+{type}, haplotagged_bam]

//
// MODULE: SAMTOOLS_INDEX
@@ -223,9 +194,7 @@
.join(mixed_hapbams)
.join(SAMTOOLS_INDEX.out.bai)
.set{ mixed_hapbams }
// mixed_hapbams -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bams: haplotagged aligned bams
// bais: indexes for bam files
// [meta+{type}, orig_bam, orig_bai, vcf, svs, mods, hapbam, hapbai]

// Group everything back together in one channel
mixed_hapbams
@@ -253,20 +222,16 @@
.join(LONGPHASE_PHASE.out.snv_vcf)
.join(LONGPHASE_PHASE.out.snv_vcf_index)
.set{tumor_normal_severus}
// tumor_normal_severus -> meta: [id, paired_data, platform, sex, fiber, basecall_model]
// tumor_bam: haplotagged aligned bam for tumor
// tumor_bai: indexes for tumor bam files
// normal_bam: haplotagged aligned bam files for normal
// normal_bai: indexes for normal bam files
// phased_vcf: phased small variant vcf for normal
// phased_vcf_index: phased small variant vcf index for normal
// [meta, tumor_hapbam, tumor_bai, normal_hapbam, normal_bai, phased_vcf, phased_tbi]

// Get ClairS input channel
tumor_normal_severus
.map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, _vcf, _tbi ->
return[meta , tumor_bam, tumor_bai, normal_bam, normal_bai, meta.clairS_model]
}
.set { clairs_input }
// [meta, tumor_bam, tumor_bai, normal_bam, normal_bai, clairS_model]

//
// MODULE: CLAIRS
//
@@ -303,6 +268,7 @@
return [meta, vcf, extra]
}
.set { somatic_vep }
// [meta, sorted_clairs_vcf, []] -- somatic small variants (SNV+indel merged) for VEP annotation

emit:
tumor_normal_severus
38 changes: 8 additions & 30 deletions subworkflows/local/tumor_only_happhase.nf
@@ -27,6 +27,7 @@ workflow TUMOR_ONLY_HAPPHASE {
return [meta, bam, bai, meta.clairSTO_model]
}
.set{ tumor_bams }
// [meta, bam, bai, clairSTO_model] -- ClairS-TO model string appended for CLAIRSTO input

//
// MODULE: CLAIRSTO
@@ -47,10 +48,7 @@
CLAIRSTO.out.indel_vcf
.join(CLAIRSTO.out.snv_vcf)
.set{ clairsto_vcf }

// clairsto_vcf -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// indel_vcf: vcf for indels
// snv_vcf: vcf for snvs
// [meta, indel_vcf, snv_vcf] -- raw ClairS-TO variant calls

//
// MODULE: VCFSPLIT
@@ -72,28 +70,23 @@
return[meta, bam, bai, snps, svs, mods]
}
.set{ tumor_bams_germlinevcf }

// [meta, bam, bai, nonsomatic_vcf, [], []] -- non-somatic variants used for phasing; svs and mods are empty placeholders for LONGPHASE_PHASE input

VCFSPLIT.out.somatic_vcf
.map { meta, vcf ->
def extra = []
return [meta,vcf, extra]
}
.set { somatic_vep }
// [meta, somatic_vcf, []] -- PASS (somatic) variants for VEP annotation

VCFSPLIT.out.germline_vcf
.map { meta, vcf ->
def extra = []
return [meta,vcf, extra]
}
.set { germline_vep }

// tumor_bams_germlinevcf -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// vcf: tumor small nonsomatic variant vcf
// svs: structural variant vcf (empty)
// mods: modcall-generated VCF with modifications (empty)
// [meta, germline_vcf, []] -- non-somatic variants (relabelled PASS) for VEP annotation

//
// MODULES: LONGPHASE_PHASE
@@ -118,13 +111,7 @@
return [new_meta, bam, bai, vcf, svs, mods]
}
.set{ tumor_bams_phasedvcf }

// tumor_bams_germlinevcf -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bam: list of concatenated aligned bams
// bai: indexes for bam files
// vcf: phased tumor small nonsomatic variant vcf
// svs: structural variant vcf (empty)
// mods: modcall-generated VCF with modifications (empty)
// [meta+{type:"tumor"}, bam, bai, phased_nonsomatic_vcf, [], []] -- type added; svs and mods are empty placeholders for LONGPHASE_HAPLOTAG

//
// MODULES: LONGPHASE_HAPLOTAG
@@ -142,9 +129,7 @@
// grab phased bams
LONGPHASE_HAPLOTAG.out.bam
.set{ haplotagged_bams }

// haplotagged_bams -> meta: [id, paired_data, platform, sex, type, fiber, basecall_model]
// bams: list of concatenated aligned bams
// [meta+{type:"tumor"}, haplotagged_bam]

//
// MODULES: SAMTOOLS_INDEX
@@ -172,14 +157,7 @@
return [new_meta, hap_bam, hap_bai, [], [], vcf, tbi]
}
.set{ tumor_only_severus }

// tumor_only_severus -> meta: [id, paired_data, platform, sex, fiber, basecall_model]
// hap_bam: haplotagged aligned bam for tumor
// hap_bai: indexes for tumor bam files
// normal_bam: haplotagged aligned bam files for normal (empty)
// normal_bai: indexes for normal bam files (empty)
// phased_vcf: phased small variant vcf
// tbi: index for phased small variant vcf
// [meta, hap_bam, hap_bai, [], [], phased_vcf, phased_tbi] -- normal_bam and normal_bai are [] (tumor-only mode)

emit:
tumor_only_severus