sanger-tol · muffato · Jun 5, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 5, 2026
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -42,4 +42,4 @@ template:
     - igenomes
     - multiqc
     - fastqc
-  version: 2.0.1
+  version: 2.0.2
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[2.0.2](https://github.com/sanger-tol/variantcalling/releases/tag/2.0.2)] - Qin Shi Huang (patch 2) - [2026-06-05]
+
+### Enhancements & fixes
+
+- Reverted a change that forced the samplesheet `sample` column to have unique entries.
+
 ## [[2.0.1](https://github.com/sanger-tol/variantcalling/releases/tag/2.0.1)] - Qin Shi Huang (patch 1) - [2026-05-15]
 
 ### Enhancements & fixes

diff --git a/CITATION.cff b/CITATION.cff
@@ -40,13 +40,13 @@ authors:
     orcid: https://orcid.org/0000-0001-7029-0785
     website: https://github.com/yz533cb
 cff-version: 1.2.0
-date-released: "2026-05-15"
+date-released: "2026-06-05"
 doi: 10.5281/zenodo.7890527
 license: MIT
 message: If you use this software, please cite it using the metadata from this file
   and all references from CITATIONS.md .
 repository-code: https://github.com/sanger-tol/variantcalling
-title: sanger-tol/variantcalling v2.0.1 - Qin Shi Huang (patch 1) [2026-05-15]
+title: sanger-tol/variantcalling v2.0.2 - Qin Shi Huang (patch 2) [2026-06-05]
 type: software
 url: https://pipelines.tol.sanger.ac.uk/variantcalling
-version: 2.0.1
+version: 2.0.2
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
@@ -1,6 +1,6 @@
 sample,datatype,datafile
-SAMEA7521030/ERR10224927_03_cram_1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
-SAMEA7521030/ERR10224927_03_bam_1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
-SAMEA7521030_2/ERR10224927_03_cram_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
-SAMEA7521030_2/ERR10224927_03_bam_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
+SAMEA7521030/ERR10224927_03_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+SAMEA7521030/ERR10224927_03_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
+SAMEA7521030_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
+SAMEA7521030_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
 SAMEA7521030_X,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv
@@ -1,4 +1,6 @@
 sample,datatype,datafile
 SAMEA7521030/ERR10224927_03,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam
 SAMEA7521030/ERR10224927_02_1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
+SAMEA7521030_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam
+SAMEA7521030_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
 SAMEA7521030_X/ERR10224927_02_2,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -29,6 +29,5 @@
             }
         },
         "required": ["sample", "datatype", "datafile"]
-    },
-    "uniqueEntries": ["sample"]
+    }
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -16,7 +16,10 @@ You will need to create a samplesheet with information about the samples you wou
 
 ### Multiple runs of the same sample
 
-The `sample` identifiers have to be unique. When a sample (`specimen`) is re-sequenced more than once e.g. to increase sequencing depth, this field is highly recommended to be in form of `specimen/run`, where `specimen` is the same and `run` is unique across these entries. Below is an example for the same sample sequenced across 3 lanes:
+When a specimen is re-sequenced more than once, e.g. to increase sequencing depth,
+the `sample` identifier is highly recommended to be in the form of `specimen/run`,
+where `specimen` is the same and `run` is unique across these entries.
+Below is an example for the same sample sequenced across 3 lanes:
 
 ```csv title="samplesheet.csv"
 sample,datatype,datafile
@@ -25,6 +28,15 @@ specimen1/run2,pacbio,sample1_2.cram
 specimen1/run3,pacbio,sample1_3.cram
 ```
 
+However, the following simplified samplesheet works too:
+
+```csv title="samplesheet.csv"
+sample,datatype,datafile
+specimen1,pacbio,sample1_1.cram
+specimen1,pacbio,sample1_2.cram
+specimen1,pacbio,sample1_3.cram
+```
+
 ### Full samplesheet
 
 A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.
@@ -36,11 +48,11 @@ specimen2/run2,pacbio,/path/to/data/file/file2.cram
 specimen3/run3,pacbio,/path/to/data/file/file3.bam
 ```
 
-| Column     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`   | Custom sample identifier. Must be unique across all entries. Spaces are automatically converted to underscores (`_`), so using underscores in the input is recommended for clarity. For specimens sequenced multiple times, use the format `specimen/run` where `specimen` is the same and `run` is unique for each run (e.g., `SAMEA7521030/ERR10224927`). The forward slash (`/`) is converted to a dot (`.`) in downstream processes. The `sample` field is also used to organize output analysis directories. |
-| `datatype` | Sequencing data type. Must be `pacbio`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| `datafile` | The location for either BAM or CRAM file.                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| Column     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `sample`   | Custom sample identifier. Spaces are automatically converted to underscores (`_`), so using underscores in the input is recommended for clarity. For specimens sequenced multiple times, use the format `specimen/run` where `specimen` is the same and `run` is unique for each run (e.g., `SAMEA7521030/ERR10224927`). The forward slash (`/`) is converted to a dot (`.`) in downstream processes. The `sample` field is also used to organize output analysis directories. |
+| `datatype` | Sequencing data type. Must be `pacbio`.                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| `datafile` | The location for either BAM or CRAM file.                                                                                                                                                                                                                                                                                                                                                                                                                                      |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

diff --git a/nextflow.config b/nextflow.config
@@ -305,7 +305,7 @@ manifest {
     mainScript      = 'main.nf'
     defaultBranch   = 'main'
     nextflowVersion = '!>=25.04.2'
-    version         = '2.0.1'
+    version         = '2.0.2'
     doi             = '10.5281/zenodo.7890527'
 }
 

diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json
@@ -159,10 +159,10 @@
             },
             "url": [
                 "https://github.com/sanger-tol/variantcalling",
-                "https://pipelines.tol.sanger.ac.uk/variantcalling/2.0.1/"
+                "https://pipelines.tol.sanger.ac.uk/variantcalling/2.0.2/"
             ],
             "version": [
-                "2.0.1"
+                "2.0.2"
             ]
         },
         {

diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
@@ -48,7 +48,7 @@ workflow ALIGN_PACBIO {
     ch_bams_to_merge = ch_bams.to_merge
         .map { _meta, orig_id_reads ->
             def meta_read = orig_id_reads[0][0]
-            def runs = orig_id_reads.collect { id_read -> id_read[0].run }
+            def runs = orig_id_reads.collect { id_read -> id_read[0].run ?: id_read[0].basename }
             def meta_read_new = meta_read + ['sample': "${meta_read.specimen}/${params.merge_output}", 'id': "${meta_read.fasta_id}.${meta_read.datatype}.${meta_read.specimen}.${params.merge_output}", 'run': "merge", 'merge_source': runs.sort().join("\n")]
             def new_reads = orig_id_reads
                 .sort { a, b -> a[0].id <=> b[0].id} // sort by id to ensure consistent order

diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
@@ -30,12 +30,12 @@ workflow INPUT_MERGE {
     ch_reads_to_merge = grouped_reads_meta.to_merge
         .map { _meta, orig_id_reads ->
             def meta_read = orig_id_reads[0][0]
-            def runs = orig_id_reads.collect { id_read -> id_read[0].run }
+            def runs = orig_id_reads.collect { id_read -> id_read[0].run ?: id_read[0].basename }
             def meta_read_new = meta_read + ['sample': "${meta_read.specimen}/${params.merge_output}",
                                             'id': "${meta_read.fasta_id}.${meta_read.datatype}.${meta_read.specimen}.${params.merge_output}",
                                             'run': "merge",
                                             'merge_source': runs.sort().join("\n"),
-                                            'basename': meta_read.basename.replaceAll(meta_read.run, params.merge_output) ]
+                                            'basename': meta_read.run ? meta_read.basename.replaceAll(meta_read.run, params.merge_output) : meta_read.basename ]
             def new_reads = orig_id_reads
                 .sort { a, b -> a[0].id <=> b[0].id} // sort by id to ensure consistent order
                 .collect { id_read -> id_read[1] }

diff --git a/subworkflows/local/utils_nfcore_variantcalling_pipeline/main.nf b/subworkflows/local/utils_nfcore_variantcalling_pipeline/main.nf
@@ -181,7 +181,6 @@ workflow PIPELINE_COMPLETION {
 
 def validateInputSamplesheet(channel) {
     def validFormats = [".cram", ".bam"]
-    def seen_ids = [:]  // Track seen IDs for duplicate detection
 
     return channel.map { row ->
         def (meta, datafile) = row
@@ -215,12 +214,6 @@ def validateInputSamplesheet(channel) {
         meta.basename = datafile.baseName
         meta.read_group = "\'@RG\\tID:" + datafile.simpleName + "\\tPL:" + platform + "\\tSM:" + meta.specimen + "\'"
 
-        // INLINE DUPLICATE CHECK - happens immediately
-        if (seen_ids.containsKey(meta.id)) {
-            error("Sample cannot be duplicated (slash (`/`) and dot (`.`) treated as equivalent): ${meta.id}")
-        }
-        seen_ids[meta.id] = true
-
         return [meta, datafile]
     }
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -29,6 +29,5 @@ @@
                 }
             },
             "required": ["sample", "datatype", "datafile"]
-        },
-        "uniqueEntries": ["sample"]
+        }
     }