From b24103105b74dead8ce1f079901609d43c861ceb Mon Sep 17 00:00:00 2001 From: albatalavera Date: Fri, 13 Mar 2026 13:18:43 +0100 Subject: [PATCH 1/2] Add nf-core/gtdbtk classifywf module --- .../nf-core/gtdbtk/classifywf/environment.yml | 8 + modules/nf-core/gtdbtk/classifywf/main.nf | 80 ++++ modules/nf-core/gtdbtk/classifywf/meta.yml | 226 +++++++++++ .../gtdbtk/classifywf/tests/main.nf.test | 104 ++++++ .../gtdbtk/classifywf/tests/main.nf.test.snap | 350 ++++++++++++++++++ .../gtdbtk/classifywf/tests/nextflow.config | 5 + 6 files changed, 773 insertions(+) create mode 100644 modules/nf-core/gtdbtk/classifywf/environment.yml create mode 100644 modules/nf-core/gtdbtk/classifywf/main.nf create mode 100644 modules/nf-core/gtdbtk/classifywf/meta.yml create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/main.nf.test create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap create mode 100644 modules/nf-core/gtdbtk/classifywf/tests/nextflow.config diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml new file mode 100644 index 00000000..73c3f9c5 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gtdbtk=2.6.1 + - conda-forge::python=3.13.12 diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf new file mode 100644 index 00000000..e7398405 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -0,0 +1,80 @@ +process GTDBTK_CLASSIFYWF { + tag "${meta.id}" + label 'process_high_memory' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c2df03eec9c0805810e0ef6caec4347d7c6545eece61e941018945502fafc9b6/data' + : 'community.wave.seqera.io/library/gtdbtk_python:cee0379cf1ca2968'}" + + input: + tuple val(meta) , path("bins/*") + tuple val(db_name), path(db) + val use_pplacer_scratch_dir + + output: + tuple val(meta), path("${prefix}") , emit: gtdb_outdir + tuple val(meta), path("${prefix}/classify/*.summary.tsv") , emit: summary + tuple val(meta), path("${prefix}/classify/*.classify.tree") , emit: tree , optional: true + tuple val(meta), path("${prefix}/identify/*.markers_summary.tsv"), emit: markers , optional: true + tuple val(meta), path("${prefix}/align/*.msa.fasta.gz") , emit: msa , optional: true + tuple val(meta), path("${prefix}/align/*.user_msa.fasta.gz") , emit: user_msa , optional: true + tuple val(meta), path("${prefix}/align/*.filtered.tsv") , emit: filtered , optional: true + tuple val(meta), path("${prefix}/identify/*.failed_genomes.tsv") , emit: failed , optional: true + tuple val(meta), path("${prefix}/${prefix}.log") , emit: log + tuple val(meta), path("${prefix}/${prefix}.warnings.log") , emit: warnings + tuple val("${task.process}"), val('gtdbtk'), eval("gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1") , topic: versions, emit: versions_gtdbtk + tuple val("${task.process}"), val('gtdb_db'), eval('grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"'), topic: versions, emit: versions_gtdbtk_db + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def pplacer_scratch = use_pplacer_scratch_dir ? 
"--scratch_dir pplacer_tmp" : "" + """ + export GTDBTK_DATA_PATH="\$(find -L ${db} -name 'metadata' -type d -exec dirname {} \\;)" + + if [ "${pplacer_scratch}" != "" ] ; then + mkdir pplacer_tmp + fi + + gtdbtk classify_wf \\ + ${args} \\ + --genome_dir bins \\ + --prefix "${prefix}" \\ + --out_dir ${prefix} \\ + --cpus ${task.cpus} \\ + ${pplacer_scratch} + + mv ${prefix}/gtdbtk.log "${prefix}/${prefix}.log" + mv ${prefix}/gtdbtk.warnings.log "${prefix}/${prefix}.warnings.log" + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix} + mkdir ${prefix}/identify + mkdir ${prefix}/classify + mkdir ${prefix}/align + + touch ${prefix}/classify/${prefix}.ar53.summary.tsv + touch ${prefix}/classify/${prefix}.bac120.summary.tsv + touch ${prefix}/classify/${prefix}.ar53.classify.tree + touch ${prefix}/classify/${prefix}.bac120.classify.tree + + touch ${prefix}/identify/${prefix}.ar53.markers_summary.tsv + touch ${prefix}/identify/${prefix}.bac120.markers_summary.tsv + + echo "" | gzip > ${prefix}/align/${prefix}.ar53.msa.fasta.gz + echo "" | gzip > ${prefix}/align/${prefix}.bac120.user_msa.fasta.gz + touch ${prefix}/align/${prefix}.ar53.filtered.tsv + touch ${prefix}/align/${prefix}.bac120.filtered.tsv + + touch ${prefix}/${prefix}.log + touch ${prefix}/${prefix}.warnings.log + touch ${prefix}/${prefix}.failed_genomes.tsv + """ +} diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml new file mode 100644 index 00000000..1c583c32 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -0,0 +1,226 @@ +name: gtdbtk_classifywf +description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications + to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. 
+keywords: + - GTDB taxonomy + - taxonomic classification + - metagenomics + - classification + - genome taxonomy database + - bacteria + - archaea +tools: + - gtdbtk: + description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications + to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. + homepage: https://ecogenomics.github.io/GTDBTk/ + documentation: https://ecogenomics.github.io/GTDBTk/ + tool_dev_url: https://github.com/Ecogenomics/GTDBTk + doi: "10.1093/bioinformatics/btz848" + licence: ["GNU General Public v3 (GPL v3)"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + - bins/*: + type: file + description: A list of one or more bins in FASTA format for classification + pattern: "*.{fasta,fna,fas,fa}{,.gz}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA + - - db_name: + type: string + description: The name of the GTDB database to use. + - db: + type: file + description: | + Path to a directory containing a GTDB database, as uncompressed from the 'full package' gtdbtk_data.tar.gz file. + You can give the 'release' directory here. + Must contain the 'metadata' subdirectory + pattern: "release[0-9]+/" + ontologies: [] + - use_pplacer_scratch_dir: + type: boolean + description: Set to true to reduce pplacer memory usage by writing to disk (slower) +output: + gtdb_outdir: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}: + type: directory + description: All files output by GTDB-Tk + pattern: "*" + summary: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/classify/*.summary.tsv: + type: file + description: A TSV summary file for the classification + pattern: "*.{summary.tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + tree: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/classify/*.classify.tree: + type: file + description: | + NJ or UPGMA trees in Newick format produced from a multiple sequence + alignment + pattern: "*.{classify.tree}" + ontologies: + - edam: http://edamontology.org/format_1910 # newick + markers: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/identify/*.markers_summary.tsv: + type: file + description: A TSV summary file of the lineage markers used for the classification. + pattern: "*.{markers_summary.tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + msa: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/align/*.msa.fasta.gz: + type: file + description: Multiple sequence alignments file. + pattern: "*.{msa.fasta.gz}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA + user_msa: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/align/*.user_msa.fasta.gz: + type: file + description: Multiple sequence alignments file for the user-provided files. + pattern: "*.{user_msa.fasta.gz}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA + filtered: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/align/*.filtered.tsv: + type: file + description: A list of genomes with an insufficient number of amino acids + in MSA + pattern: "*.{filtered.tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + failed: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/identify/*.failed_genomes.tsv: + type: file + description: A TSV summary of the genomes which GTDB-tk failed to classify. + pattern: "*.{failed_genomes.tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/${prefix}.log: + type: file + description: GTDB-tk log file + pattern: "*.{log}" + ontologies: + - edam: http://edamontology.org/data_3671 # Text + warnings: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false, assembler:'spades' ] + pattern: "*" + - ${prefix}/${prefix}.warnings.log: + type: file + description: GTDB-tk warnings log file + pattern: "*.{warnings.log}" + ontologies: + - edam: http://edamontology.org/data_3671 # Text + versions_gtdbtk: + - - ${task.process}: + type: string + description: The name of the process + - gtdbtk: + type: string + description: The name of the tool + - "gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1": + type: eval + description: The expression to obtain the version of the tool + versions_gtdbtk_db: + - - ${task.process}: + type: string + description: The name of the process + - gtdb_db: + type: string + description: The name of the database + - 'grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"': + type: eval + description: The expression to obtain the version of the database +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - gtdbtk: + type: string + description: The name of the tool + - "gtdbtk --version 2>&1 | grep -Eo '[0-9]+(\\.[0-9]+)+' | head -1": + type: eval + description: The expression to obtain the version of the tool + - - ${task.process}: + type: string + description: The name of the process + - gtdb_db: + type: string + description: The name of the database + - 'grep VERSION_DATA $GTDBTK_DATA_PATH/metadata/metadata.txt | sed "s/VERSION_DATA=//"': + type: eval + description: The expression to obtain the version of the database +authors: + - "@skrakau" + - "@prototaxites" + - "@abhi18av" +maintainers: + - "@skrakau" + - "@abhi18av" diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test new file mode 100644 index 00000000..41f0aba3 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test @@ -0,0 +1,104 @@ +nextflow_process { + + name "Test Process GTDBTK_CLASSIFYWF" + script "../main.nf" + process 
"GTDBTK_CLASSIFYWF" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "gtdbtk" + tag "gtdbtk/classifywf" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'mockup' ], // meta map + file('https://github.com/nf-core/test-datasets/raw/refs/heads/mag/databases/gtdbtk/gtdbtk_mockup_20250422.tar.gz', checkIfExists: true) + ] + """ + } + } + } + + // Uses the special mini test dataset provided by the GTDB developers + test("E. coli - genome fasta") { + + when { + + params { + // Recommended by the GTDB developers: https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/mockup_db/HOWTO.txt + // NOTE(review): presumably the ANI screen requires comparison against a full database, so `--skip_ani_screen` is set and the run goes through the full classify_wf pipeline + module_args = '--extension fa --skip_ani_screen' + } + + process { + """ + input[0] = [ + [ id:'test', single_end:false, assembler:'SPADES' ], + [ + file(params.modules_testdata_base_path + 'genomics/prokaryotes/escherichia_coli/genome/genome.fa', checkIfExists: true), + ] + ] + input[1] = UNTAR.out.untar + input[2] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.summary, + process.out.tree, + process.out.markers, + process.out.msa, + process.out.user_msa, + process.out.filtered, + file(process.out.log[0][1]).readLines().contains('INFO: Done.'), + process.out.versions_gtdbtk, + process.out.versions_gtdbtk_db, + ).match() } + ) + } + } + + test("sarscov2 - genome fasta - stub") { + + options "-stub" + + when { + + params { + // Recommended by the GTDB developers: https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/mockup_db/HOWTO.txt + // NOTE(review): presumably the ANI screen requires comparison against a full database, so `--skip_ani_screen` is set and the run goes through the full classify_wf pipeline + module_args = 
'--extension fa --skip_ani_screen' + } + + process { + """ + input[0] = [ + [ id:'test', 
single_end:false, assembler:'SPADES' ], + [ + file(params.modules_testdata_base_path + 'genomics/prokaryotes/escherichia_coli/genome/genome.fa', checkIfExists: true), + ] + ] + input[1] = [[:],[]] + input[2] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap new file mode 100644 index 00000000..6d902757 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap @@ -0,0 +1,350 @@ +{ + "E. coli - genome fasta": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.bac120.summary.tsv:md5,6d78949595cbfb145108cc5374dbac23" + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.markers_summary.tsv:md5,3ba12aa91791ee263df1bee8558413eb", + "test.bac120.markers_summary.tsv:md5,49c710e91fe35aff2ee5bf6b1b949ed4" + ] + ] + ], + [ + + ], + [ + + ], + [ + + ], + false, + [ + [ + "GTDBTK_CLASSIFYWF", + "gtdbtk", + "2.6.1" + ] + ], + [ + [ + "GTDBTK_CLASSIFYWF", + "gtdb_db", + "r226" + ] + ] + ], + "timestamp": "2026-03-06T18:32:07.523214921", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - genome fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + [ + "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + [ + "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "10": [ + [ + "GTDBTK_CLASSIFYWF", + "gtdbtk", + "2.6.1" + ] + ], + "11": [ + [ + "GTDBTK_CLASSIFYWF", + "gtdb_db", + "" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "7": [ + + ], + "8": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + [ + { + "id": 
"test", + "single_end": false, + "assembler": "SPADES" + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "failed": [ + + ], + "filtered": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "gtdb_outdir": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + [ + "test.ar53.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bac120.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + [ + "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "markers": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.ar53.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "summary": [ + [ + { + "id": "test", + "single_end": 
false, + "assembler": "SPADES" + }, + [ + "test.ar53.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "tree": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + [ + "test.ar53.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.bac120.classify.tree:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "user_msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.bac120.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions_gtdbtk": [ + [ + "GTDBTK_CLASSIFYWF", + "gtdbtk", + "2.6.1" + ] + ], + "versions_gtdbtk_db": [ + [ + "GTDBTK_CLASSIFYWF", + "gtdb_db", + "" + ] + ], + "warnings": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "timestamp": "2026-03-06T18:32:13.543585307", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config b/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config new file mode 100644 index 00000000..4cc3ad07 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GTDBTK_CLASSIFYWF { + ext.args = params.module_args + } +} From 066c8dc3a0fb6843fbf6046c18691a8c8abd71b4 Mon Sep 17 00:00:00 2001 From: albatalavera Date: Fri, 13 Mar 2026 14:27:49 +0100 Subject: [PATCH 2/2] Add preliminary GTDB-Tk classification to bacass workflow --- workflows/bacass.nf | 53 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 744f4577..6d9d49db 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -42,6 +42,7 @@ include { QUAST } from '../modules/nf-core/quast include { QUAST as QUAST_BYREFSEQID } from 
'../modules/nf-core/quast' include { QUAST as QUAST_BYSAMPLE } from '../modules/nf-core/quast' include { BUSCO_BUSCO } from '../modules/nf-core/busco/busco/main' +include { GTDBTK_CLASSIFYWF } from '../modules/nf-core/gtdbtk/classifywf' include { GUNZIP } from '../modules/nf-core/gunzip' include { PROKKA } from '../modules/nf-core/prokka' include { FILTLONG } from '../modules/nf-core/filtlong' @@ -75,7 +76,7 @@ workflow BACASS { main: // Check input path parameters to see if they exist - def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config, params.reference_fasta, params.reference_gff ] + def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config, params.reference_fasta, params.reference_gff, params.gtdbtk_db ] checkPathParamList.each { param -> if (param) { file(param, checkIfExists: true) } } if (params.reference_fasta) { @@ -778,6 +779,13 @@ workflow BACASS { } .set{ ch_assembly_for_gunzip } + ch_assembly_uncompressed = ch_assembly + if ((!params.skip_annotation && params.annotation_tool in ['prokka', 'bakta']) || !params.skip_gtdbtk) { + GUNZIP ( ch_assembly_for_gunzip.gzip ) + ch_assembly_uncompressed = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip ) + ch_versions = ch_versions.mix( GUNZIP.out.versions ) + } + // // MODULE: BUSCO, assess genome assembly completeness // @@ -795,18 +803,41 @@ workflow BACASS { ch_versions = ch_versions.mix(BUSCO_BUSCO.out.versions) } + // + // MODULE: GTDB-Tk, taxonomic classification of final assemblies + // + ch_gtdbtk_summary = channel.empty() + if (!params.skip_gtdbtk) { + if (!params.gtdbtk_db) { + error("GTDB-Tk requires `--gtdbtk_db` when `--skip_gtdbtk false`.") + } + + ch_assembly_uncompressed.filter{ _meta, fasta -> !fasta.isEmpty() } // skip empty assemblies, same guard as the PROKKA/BAKTA inputs + .map { meta, fasta -> + def new_meta = meta.clone() + def fasta_name = fasta.name.toLowerCase() + def extension = 
fasta_name.endsWith('.fasta') ? 'fasta' : fasta_name.endsWith('.fna') ? 
'fna' : 'fa' + new_meta.gtdb_ext = extension + [ new_meta, [ fasta ] ] + } + .set { ch_gtdbtk_input } + + GTDBTK_CLASSIFYWF ( + ch_gtdbtk_input, + channel.value([ params.gtdbtk_db_name ?: 'gtdbtk', file(params.gtdbtk_db, checkIfExists: true) ]), + params.gtdbtk_use_pplacer_scratch_dir + ) + + ch_gtdbtk_summary = GTDBTK_CLASSIFYWF.out.summary + } + // // MODULE: PROKKA, gene annotation // ch_prokka_txt_multiqc = channel.empty() if ( !params.skip_annotation && params.annotation_tool == 'prokka' ) { - // Uncompress assembly for annotation if necessary - GUNZIP ( ch_assembly_for_gunzip.gzip ) - ch_to_prokka = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip ) - ch_versions = ch_versions.mix( GUNZIP.out.versions ) - PROKKA ( - ch_to_prokka.filter{ _meta, fasta -> !fasta.isEmpty() }, + ch_assembly_uncompressed.filter{ _meta, fasta -> !fasta.isEmpty() }, ch_proteins, [] ) @@ -819,13 +850,8 @@ workflow BACASS { // ch_bakta_txt_multiqc = channel.empty() if ( !params.skip_annotation && params.annotation_tool == 'bakta' ) { - // Uncompress assembly for annotation if necessary - GUNZIP ( ch_assembly_for_gunzip.gzip ) - ch_to_bakta = ch_assembly_for_gunzip.skip.mix( GUNZIP.out.gunzip ) - ch_versions = ch_versions.mix( GUNZIP.out.versions ) - BAKTA_DBDOWNLOAD_RUN ( - ch_to_bakta.filter{ _meta, fasta -> !fasta.isEmpty() }, + ch_assembly_uncompressed.filter{ _meta, fasta -> !fasta.isEmpty() }, params.baktadb, params.baktadb_download ) @@ -939,6 +965,7 @@ workflow BACASS { emit: multiqc_report = CUSTOM_MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + gtdbtk_summary = ch_gtdbtk_summary // channel: [ val(meta), path(summary.tsv) ] versions = ch_versions // channel: [ path(versions.yml) ] }