From b24103105b74dead8ce1f079901609d43c861ceb Mon Sep 17 00:00:00 2001
From: albatalavera <albatalavera92@gmail.com>
Date: Fri, 13 Mar 2026 13:18:43 +0100
Subject: [PATCH 1/2] Add nf-core/gtdbtk classifywf module

---
 .../nf-core/gtdbtk/classifywf/environment.yml |   8 +
 modules/nf-core/gtdbtk/classifywf/main.nf     |  80 ++++
 modules/nf-core/gtdbtk/classifywf/meta.yml    | 226 +++++++++++
 .../gtdbtk/classifywf/tests/main.nf.test      | 104 ++++++
 .../gtdbtk/classifywf/tests/main.nf.test.snap | 350 ++++++++++++++++++
 .../gtdbtk/classifywf/tests/nextflow.config   |   5 +
 6 files changed, 773 insertions(+)
 create mode 100644 modules/nf-core/gtdbtk/classifywf/environment.yml
 create mode 100644 modules/nf-core/gtdbtk/classifywf/main.nf
 create mode 100644 modules/nf-core/gtdbtk/classifywf/meta.yml
 create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/main.nf.test
 create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/nextflow.config

diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml
new file mode 100644
index 00000000..73c3f9c5
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::gtdbtk=2.6.1 # tests/main.nf.test.snap records this tool version (2.6.1)
+  - conda-forge::python=3.13.12
diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf
new file mode 100644
index 00000000..e7398405
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/main.nf
@@ -0,0 +1,80 @@
+process GTDBTK_CLASSIFYWF { // Runs `gtdbtk classify_wf` on a directory of genome bins against a staged GTDB database
+    tag "${meta.id}"
+    label 'process_high_memory'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c2df03eec9c0805810e0ef6caec4347d7c6545eece61e941018945502fafc9b6/data'
+        : 'community.wave.seqera.io/library/gtdbtk_python:cee0379cf1ca2968'}"
+
+    input:
+    tuple val(meta)   , path("bins/*")  // bins are staged into ./bins, which is handed to --genome_dir
+    tuple val(db_name), path(db)        // db is searched for its 'metadata' directory; db_name is not used by the script
+    val use_pplacer_scratch_dir         // truthy => adds `--scratch_dir pplacer_tmp` to the command line
+
+    output:
+    tuple val(meta), path("${prefix}")                               , emit: gtdb_outdir
+    tuple val(meta), path("${prefix}/classify/*.summary.tsv")        , emit: summary
+    tuple val(meta), path("${prefix}/classify/*.classify.tree")      , emit: tree       , optional: true
+    tuple val(meta), path("${prefix}/identify/*.markers_summary.tsv"), emit: markers    , optional: true
+    tuple val(meta), path("${prefix}/align/*.msa.fasta.gz")          , emit: msa        , optional: true
+    tuple val(meta), path("${prefix}/align/*.user_msa.fasta.gz")     , emit: user_msa   , optional: true
+    tuple val(meta), path("${prefix}/align/*.filtered.tsv")          , emit: filtered   , optional: true
+    tuple val(meta), path("${prefix}/identify/*.failed_genomes.tsv") , emit: failed     , optional: true
+    tuple val(meta), path("${prefix}/${prefix}.log")                 , emit: log
+    tuple val(meta), path("${prefix}/${prefix}.warnings.log")        , emit: warnings
+    tuple val("${task.process}"), val('gtdbtk'), eval("gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1") , topic: versions, emit: versions_gtdbtk
+    tuple val("${task.process}"), val('gtdb_db'), eval('grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"'), topic: versions, emit: versions_gtdbtk_db // NOTE(review): GTDBTK_DATA_PATH is exported only by the script block, not by the stub
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args            = task.ext.args ?: ''
+    prefix              = task.ext.prefix ?: "${meta.id}"  // declared without `def`; the output: paths above reference ${prefix}
+    def pplacer_scratch = use_pplacer_scratch_dir ? "--scratch_dir pplacer_tmp" : ""
+    """
+    export GTDBTK_DATA_PATH="\$(find -L "${db}" -name 'metadata' -type d -exec dirname {} \\;)"  # parent of the 'metadata' directory
+
+    if [ "${pplacer_scratch}" != "" ] ; then
+        mkdir pplacer_tmp
+    fi
+
+    gtdbtk classify_wf \\
+        ${args} \\
+        --genome_dir bins \\
+        --prefix "${prefix}" \\
+        --out_dir "${prefix}" \\
+        --cpus ${task.cpus} \\
+        ${pplacer_scratch}
+
+    mv "${prefix}/gtdbtk.log" "${prefix}/${prefix}.log"  # rename the logs to the paths declared by the log/warnings outputs
+    mv "${prefix}/gtdbtk.warnings.log" "${prefix}/${prefix}.warnings.log"
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir "${prefix}"
+    mkdir "${prefix}/identify"
+    mkdir "${prefix}/classify"
+    mkdir "${prefix}/align"
+
+    touch "${prefix}/classify/${prefix}.ar53.summary.tsv"
+    touch "${prefix}/classify/${prefix}.bac120.summary.tsv"
+    touch "${prefix}/classify/${prefix}.ar53.classify.tree"
+    touch "${prefix}/classify/${prefix}.bac120.classify.tree"
+
+    touch "${prefix}/identify/${prefix}.ar53.markers_summary.tsv"
+    touch "${prefix}/identify/${prefix}.bac120.markers_summary.tsv"
+
+    echo "" | gzip > "${prefix}/align/${prefix}.ar53.msa.fasta.gz"
+    echo "" | gzip > "${prefix}/align/${prefix}.bac120.user_msa.fasta.gz"
+    touch "${prefix}/align/${prefix}.ar53.filtered.tsv"
+    touch "${prefix}/align/${prefix}.bac120.filtered.tsv"
+
+    touch "${prefix}/${prefix}.log"
+    touch "${prefix}/${prefix}.warnings.log"
+    touch "${prefix}/${prefix}.failed_genomes.tsv"  # NOTE(review): not under identify/, so the optional 'failed' output glob never matches this file
+    """
+}
diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml
new file mode 100644
index 00000000..1c583c32
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/meta.yml
@@ -0,0 +1,226 @@
+name: gtdbtk_classifywf
+description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications
+  to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB.
+keywords:
+  - GTDB taxonomy
+  - taxonomic classification
+  - metagenomics
+  - classification
+  - genome taxonomy database
+  - bacteria
+  - archaea
+tools:
+  - gtdbtk:
+      description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications
+        to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB.
+      homepage: https://ecogenomics.github.io/GTDBTk/
+      documentation: https://ecogenomics.github.io/GTDBTk/
+      tool_dev_url: https://github.com/Ecogenomics/GTDBTk
+      doi: "10.1093/bioinformatics/btz848"
+      licence: ["GNU General Public v3 (GPL v3)"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+    - bins/*:
+        type: file
+        description: A list of one or more bins in FASTA format for classification
+        pattern: "*.{fasta,fna,fas,fa}{,.gz}"
+        ontologies:
+          - edam: http://edamontology.org/format_1929 # FASTA
+  - - db_name:
+        type: string
+        description: The name of the GTDB database to use.
+    - db:
+        type: file
+        description: |
+          Path to a directory containing a GTDB database, as uncompressed from the 'full package' gtdbtk_data.tar.gz file.
+          You can give the 'release<version number>' directory here.
+          Must contain the 'metadata' subdirectory
+        pattern: "release[0-9]+/"
+        ontologies: []
+  - use_pplacer_scratch_dir:
+      type: boolean
+      description: Set to true to reduce pplacer memory usage by writing to disk (slower)
+output:
+  gtdb_outdir:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}:
+          type: directory
+          description: All files output by GTDB-Tk
+          pattern: "*"
+  summary:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/classify/*.summary.tsv:
+          type: file
+          description: A TSV summary file for the classification
+          pattern: "*.{summary.tsv}"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  tree:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/classify/*.classify.tree:
+          type: file
+          description: |
+            NJ or UPGMA trees in Newick format produced from a multiple sequence
+            alignment
+          pattern: "*.{classify.tree}"
+          ontologies:
+            - edam: http://edamontology.org/format_1910 # newick
+  markers:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/identify/*.markers_summary.tsv:
+          type: file
+          description: A TSV summary file of the lineage markers used for the classification.
+          pattern: "*.{markers_summary.tsv}"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  msa:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/align/*.msa.fasta.gz:
+          type: file
+          description: Multiple sequence alignments file.
+          pattern: "*.{msa.fasta.gz}"
+          ontologies:
+            - edam: http://edamontology.org/format_1929 # FASTA
+  user_msa:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/align/*.user_msa.fasta.gz:
+          type: file
+          description: Multiple sequence alignments file for the user-provided files.
+          pattern: "*.{user_msa.fasta.gz}"
+          ontologies:
+            - edam: http://edamontology.org/format_1929 # FASTA
+  filtered:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/align/*.filtered.tsv:
+          type: file
+          description: A list of genomes with an insufficient number of amino acids
+            in MSA
+          pattern: "*.{filtered.tsv}"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  failed:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/identify/*.failed_genomes.tsv:
+          type: file
+          description: A TSV summary of the genomes which GTDB-tk failed to classify.
+          pattern: "*.{failed_genomes.tsv}"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/${prefix}.log:
+          type: file
+          description: GTDB-tk log file
+          pattern: "*.{log}"
+          ontologies:
+            - edam: http://edamontology.org/data_3671 # Text
+  warnings:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false,  assembler:'spades' ]
+          pattern: "*"
+      - ${prefix}/${prefix}.warnings.log:
+          type: file
+          description: GTDB-tk warnings log file
+          pattern: "*.{warnings.log}"
+          ontologies:
+            - edam: http://edamontology.org/data_3671 # Text
+  versions_gtdbtk:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - gtdbtk:
+          type: string
+          description: The name of the tool
+      - "gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1":
+          type: eval
+          description: The expression to obtain the version of the tool
+  versions_gtdbtk_db:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - gtdb_db:
+          type: string
+          description: The name of the database
+      - 'grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"':
+          type: eval
+          description: The expression to obtain the version of the database
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - gtdbtk:
+          type: string
+          description: The name of the tool
+      - "gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1":
+          type: eval
+          description: The expression to obtain the version of the tool
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - gtdb_db:
+          type: string
+          description: The name of the database
+      - 'grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"':
+          type: eval
+          description: The expression to obtain the version of the database
+authors:
+  - "@skrakau"
+  - "@prototaxites"
+  - "@abhi18av"
+maintainers:
+  - "@skrakau"
+  - "@abhi18av"
diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test
new file mode 100644
index 00000000..41f0aba3
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test
@@ -0,0 +1,104 @@
+nextflow_process {
+
+    name "Test Process GTDBTK_CLASSIFYWF"
+    script "../main.nf"
+    process "GTDBTK_CLASSIFYWF"
+    config './nextflow.config' // maps params.module_args onto ext.args for the process under test
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "gtdbtk"
+    tag "gtdbtk/classifywf"
+    tag "untar" // the setup block below runs the UNTAR module
+
+    setup {
+        run("UNTAR") { // unpacks the mock-up database tarball; UNTAR.out.untar feeds input[1] of the first test
+            script "../../../untar/main.nf"
+            process {
+            """
+            input[0] = [
+                        [ id:'mockup' ], // meta map
+                        file('https://github.com/nf-core/test-datasets/raw/refs/heads/mag/databases/gtdbtk/gtdbtk_mockup_20250422.tar.gz', checkIfExists: true)
+            ]
+            """
+            }
+        }
+    }
+
+    // Uses the special mini (mock-up) test database provided by the GTDB developers
+    test("E. coli - genome fasta") {
+
+        when {
+
+            params {
+                // Recommended by the GTDB developers: https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/mockup_db/HOWTO.txt
+                // The `--skip_ani_screen` requires comparison against a full database and goes through the full classifywf pipeline
+                module_args = '--extension fa --skip_ani_screen'
+            }
+
+            process {
+                """
+                input[0] = [
+                        [ id:'test', single_end:false, assembler:'SPADES' ],
+                        [
+                            file(params.modules_testdata_base_path + 'genomics/prokaryotes/escherichia_coli/genome/genome.fa', checkIfExists: true),
+                        ]
+                    ]
+                input[1] = UNTAR.out.untar
+                input[2] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot( // gtdb_outdir, failed, warnings and the raw log file are not part of this snapshot
+                        process.out.summary,
+                        process.out.tree,
+                        process.out.markers,
+                        process.out.msa,
+                        process.out.user_msa,
+                        process.out.filtered,
+                        file(process.out.log[0][1]).readLines().contains('INFO: Done.'), // NOTE(review): List.contains needs a line exactly equal to 'INFO: Done.'; the committed snapshot stores false - confirm this is intended
+                        process.out.versions_gtdbtk,
+                        process.out.versions_gtdbtk_db,
+                    ).match() }
+            )
+        }
+    }
+
+    test("sarscov2 - genome fasta - stub") { // NOTE(review): despite the name, the input below is the E. coli genome
+
+        options "-stub"
+
+        when {
+
+            params {
+                // Recommended by the GTDB developers: https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/mockup_db/HOWTO.txt
+                // The `--skip_ani_screen` requires comparison against a full database and goes through the full classifywf pipeline
+                module_args = '--extension fa --skip_ani_screen'
+            }
+
+            process { // input[1] is an empty placeholder here: no database is passed to the stub run
+                """
+                input[0] = [
+                        [ id:'test', single_end:false, assembler:'SPADES' ],
+                        [
+                            file(params.modules_testdata_base_path + 'genomics/prokaryotes/escherichia_coli/genome/genome.fa', checkIfExists: true),
+                        ]
+                    ]
+                input[1] = [[:],[]]
+                input[2] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() } // snapshots every output channel of the stub
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap
new file mode 100644
index 00000000..6d902757
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap
@@ -0,0 +1,350 @@
+{
+    "E. coli - genome fasta": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": false,
+                        "assembler": "SPADES"
+                    },
+                    "test.bac120.summary.tsv:md5,6d78949595cbfb145108cc5374dbac23"
+                ]
+            ],
+            [
+                
+            ],
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": false,
+                        "assembler": "SPADES"
+                    },
+                    [
+                        "test.ar53.markers_summary.tsv:md5,3ba12aa91791ee263df1bee8558413eb",
+                        "test.bac120.markers_summary.tsv:md5,49c710e91fe35aff2ee5bf6b1b949ed4"
+                    ]
+                ]
+            ],
+            [
+                
+            ],
+            [
+                
+            ],
+            [
+                
+            ],
+            false,
+            [
+                [
+                    "GTDBTK_CLASSIFYWF",
+                    "gtdbtk",
+                    "2.6.1"
+                ]
+            ],
+            [
+                [
+                    "GTDBTK_CLASSIFYWF",
+                    "gtdb_db",
+                    "r226"
+                ]
+            ]
+        ],
+        "timestamp": "2026-03-06T18:32:07.523214921",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "sarscov2 - genome fasta - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            [
+                                "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940",
+                                "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                            ],
+                            [
+                                "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                            ],
+                            [
+                                "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                            ],
+                            "test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.log:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "10": [
+                    [
+                        "GTDBTK_CLASSIFYWF",
+                        "gtdbtk",
+                        "2.6.1"
+                    ]
+                ],
+                "11": [
+                    [
+                        "GTDBTK_CLASSIFYWF",
+                        "gtdb_db",
+                        ""
+                    ]
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "3": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "4": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "5": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "6": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "7": [
+                    
+                ],
+                "8": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "9": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "failed": [
+                    
+                ],
+                "filtered": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "gtdb_outdir": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            [
+                                "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940",
+                                "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                            ],
+                            [
+                                "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                            ],
+                            [
+                                "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                                "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                            ],
+                            "test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.log:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "log": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "markers": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "msa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "summary": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "tree": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        [
+                            "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e",
+                            "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e"
+                        ]
+                    ]
+                ],
+                "user_msa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "versions_gtdbtk": [
+                    [
+                        "GTDBTK_CLASSIFYWF",
+                        "gtdbtk",
+                        "2.6.1"
+                    ]
+                ],
+                "versions_gtdbtk_db": [
+                    [
+                        "GTDBTK_CLASSIFYWF",
+                        "gtdb_db",
+                        ""
+                    ]
+                ],
+                "warnings": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false,
+                            "assembler": "SPADES"
+                        },
+                        "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-03-06T18:32:13.543585307",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config b/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config
new file mode 100644
index 00000000..4cc3ad07
--- /dev/null
+++ b/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: GTDBTK_CLASSIFYWF { // selector: settings below apply only to the GTDBTK_CLASSIFYWF process
+        ext.args = params.module_args // forwards params.module_args as ext.args; presumably set per test case in main.nf.test - confirm
+    }
+}

From 066c8dc3a0fb6843fbf6046c18691a8c8abd71b4 Mon Sep 17 00:00:00 2001
From: albatalavera <albatalavera92@gmail.com>
Date: Fri, 13 Mar 2026 14:27:49 +0100
Subject: [PATCH 2/2] Add preliminary GTDB-Tk classification to bacass workflow

---
 workflows/bacass.nf | 53 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/workflows/bacass.nf b/workflows/bacass.nf
index 744f4577..6d9d49db 100644
--- a/workflows/bacass.nf
+++ b/workflows/bacass.nf
@@ -42,6 +42,7 @@ include { QUAST                                 } from '../modules/nf-core/quast
 include { QUAST as QUAST_BYREFSEQID             } from '../modules/nf-core/quast'
 include { QUAST as QUAST_BYSAMPLE               } from '../modules/nf-core/quast'
 include { BUSCO_BUSCO                           } from '../modules/nf-core/busco/busco/main'
+include { GTDBTK_CLASSIFYWF                     } from '../modules/nf-core/gtdbtk/classifywf'
 include { GUNZIP                                } from '../modules/nf-core/gunzip'
 include { PROKKA                                } from '../modules/nf-core/prokka'
 include { FILTLONG                              } from '../modules/nf-core/filtlong'
@@ -75,7 +76,7 @@ workflow BACASS {
     main:
 
     // Check input path parameters to see if they exist
-    def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config, params.reference_fasta, params.reference_gff ]
+    def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config, params.reference_fasta, params.reference_gff, params.gtdbtk_db ]
     checkPathParamList.each { param -> if (param) { file(param, checkIfExists: true) } }
 
     if (params.reference_fasta) {
@@ -778,6 +779,13 @@ workflow BACASS {
         }
         .set{ ch_assembly_for_gunzip }
 
+    ch_assembly_uncompressed = ch_assembly // default; kept when none of PROKKA, Bakta or GTDB-Tk is going to run
+    if ((!params.skip_annotation && params.annotation_tool in ['prokka', 'bakta']) || !params.skip_gtdbtk) { // a consumer of uncompressed FASTA is enabled
+        GUNZIP ( ch_assembly_for_gunzip.gzip ) // invoked once here; the result is reused by PROKKA, Bakta and GTDB-Tk below
+        ch_assembly_uncompressed = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip ) // `skip` branch (presumably already uncompressed - confirm) + gunzipped files
+        ch_versions = ch_versions.mix( GUNZIP.out.versions )
+    }
+
     //
     // MODULE: BUSCO, assess genome assembly completeness
     //
@@ -795,18 +803,41 @@ workflow BACASS {
         ch_versions = ch_versions.mix(BUSCO_BUSCO.out.versions)
     }
 
+    //
+    // MODULE: GTDB-Tk, taxonomic classification of final assemblies
+    //
+    ch_gtdbtk_summary = channel.empty() // stays empty when GTDB-Tk is skipped, so the workflow emit is always defined
+    if (!params.skip_gtdbtk) {
+        if (!params.gtdbtk_db) { // fail early: the database path is a required input of the process call below
+            error("GTDB-Tk requires `--gtdbtk_db` when `--skip_gtdbtk false`.")
+        }
+
+        // Drop empty assemblies (same guard as PROKKA/BAKTA) and record each FASTA extension in `meta.gtdb_ext`
+        ch_assembly_uncompressed
+            .filter { _meta, fasta -> !fasta.isEmpty() }
+            .map { meta, fasta ->
+                def fasta_name = fasta.name.toLowerCase()
+                def extension = fasta_name.endsWith('.fasta') ? 'fasta' : fasta_name.endsWith('.fna') ? 'fna' : 'fa'
+                [ meta + [ gtdb_ext: extension ], [ fasta ] ] // `+` builds a new map; the upstream meta is not mutated
+            }
+            .set { ch_gtdbtk_input }
+
+        GTDBTK_CLASSIFYWF (
+            ch_gtdbtk_input, // [ meta, [ fasta ] ]
+            channel.value([ params.gtdbtk_db_name ?: 'gtdbtk', file(params.gtdbtk_db, checkIfExists: true) ]), // [ db_name, db_path ]
+            params.gtdbtk_use_pplacer_scratch_dir
+        )
+
+        ch_gtdbtk_summary = GTDBTK_CLASSIFYWF.out.summary
+    }
+
     //
     // MODULE: PROKKA, gene annotation
     //
     ch_prokka_txt_multiqc = channel.empty()
     if ( !params.skip_annotation && params.annotation_tool == 'prokka' ) {
-        // Uncompress assembly for annotation if necessary
-        GUNZIP ( ch_assembly_for_gunzip.gzip )
-        ch_to_prokka    = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip )
-        ch_versions     = ch_versions.mix( GUNZIP.out.versions )
-
         PROKKA (
-            ch_to_prokka.filter{ _meta, fasta -> !fasta.isEmpty() },
+            ch_assembly_uncompressed.filter{ _meta, fasta -> !fasta.isEmpty() },
             ch_proteins,
             []
         )
@@ -819,13 +850,8 @@ workflow BACASS {
     //
     ch_bakta_txt_multiqc = channel.empty()
     if ( !params.skip_annotation && params.annotation_tool == 'bakta' ) {
-        // Uncompress assembly for annotation if necessary
-        GUNZIP ( ch_assembly_for_gunzip.gzip )
-        ch_to_bakta     = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip )
-        ch_versions     = ch_versions.mix( GUNZIP.out.versions )
-
         BAKTA_DBDOWNLOAD_RUN (
-            ch_to_bakta.filter{ _meta, fasta -> !fasta.isEmpty() },
+            ch_assembly_uncompressed.filter{ _meta, fasta -> !fasta.isEmpty() },
             params.baktadb,
             params.baktadb_download
         )
@@ -939,6 +965,7 @@ workflow BACASS {
 
     emit:
     multiqc_report = CUSTOM_MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
+    gtdbtk_summary = ch_gtdbtk_summary                  // channel: [ val(meta), path(summary.tsv) ]
     versions       = ch_versions                        // channel: [ path(versions.yml) ]
 
 }