ICM_Code/Run_Multiphate2.py at main · felipehcoutinho/ICM_Code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#! /usr/bin/env python3

from collections import defaultdict
from Bio import SeqIO
from pathlib import Path
import argparse
import subprocess
import re
import glob
import gzip

parser = argparse.ArgumentParser()
parser.add_argument("--genomes_file", help="File of genomic sequences", type =str)
parser.add_argument("--in_format", help="Input Sequence file format", default="fasta", type =str)
parser.add_argument("--threads", help="The number of threads to be used", default=1, type=int)
parser.add_argument("--parse_only", help="Flag to skip running any programs and only parse their output", default=False, type=bool)
args = parser.parse_args()


def central():
    file_prefix = index_split(genomes_file=args.genomes_file,genomes_format=args.in_format)
    config_file = prepare_config_file(prefix=file_prefix,threads=args.threads)
    call_multiphate(cf=config_file)
    protein_file = collect_proteins(prefix=file_prefix)

def collect_proteins(prefix="NA"):
    """Parse the protein.faa files generated by multiphate for each genomes. Rename proteins to be consistent with prodigal nomeclautre and usign the original seq IDs. Merge all proteins in a single fasta file"""
    fasta_outfile = prefix + ".faa"
    with open(fasta_outfile, 'w', newline='') as OUT:
        for seq_id in seq_info['Description'].keys():
            genome_oid = seq_info['Original_ID'][seq_id]
            prot_file = "/mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/PipelineOutput/"+seq_id+"/protein.faa"
            seq_counter = 0
            print(f"Processing {prot_file}")
            try:
                for seqobj in SeqIO.parse(prot_file, "fasta"):
                    seq_counter += 1
                    gene_nid = seqobj.id
                    sub_ids = gene_nid.split("_phanotate_")
                    gene_num = re.sub('_geneCall','',sub_ids[1])
                    gene_oid = genome_oid+"_"+gene_num
                    seqobj.id = gene_oid
                    SeqIO.write(seqobj, OUT, "fasta")
                print(f"Processed {seq_counter} sequences")
            except:
                print(f"Exception while processing {prot_file}")
            print (f"Processed {seq_counter} sequences.")
    return(fasta_outfile)

def call_multiphate(cf="NA"):
    """Move files fron Input_Genomes/ to PipelineInput/ call multiphate2 with the previously generated configuration file"""
    if (args.parse_only == False):
        command = "mv Input_Genomes/*fasta /mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/PipelineInput/"
        subprocess.call(command, shell=True)
        print("Running multiPhATE2")
        command = f"python3 /mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/multiPhate.py {cf}"
        subprocess.call(command, shell=True)

def prepare_config_file(prefix="MultiPhate2",threads=1):
    """Write a configuration file for multiphate2 based on the information stored in seq_info"""
    config_file = f"{prefix}.config"
    print(f"Writing configuration file to {config_file}")
    with open(config_file, 'w', newline='') as OUT:
        OUT.write("Genome List:\n")
        genome_counter = 0
        for genome_id in seq_info['Description'].keys():
            genome_counter += 1
            OUT.write(f"Genome {genome_counter}\n")
            OUT.write(f"genome_file='{seq_info['New_File'][genome_id]}'\n")
            OUT.write("genome_type='phage'\n")
            OUT.write("genome_species='NA'\n")
        OUT.write("END of list\n")
        OUT.write("check_databases='false'\n")
        OUT.write("phanotate_calls='true'\n")
        OUT.write("prodigal_calls='false'\n")
        OUT.write("glimmer_calls='false'\n")
        OUT.write("genemarks_calls='false'\n")
        OUT.write("genemarks_path=''\n")
        OUT.write("custom_gene_calls='false'\n")
        OUT.write("primary_calls='phanotate'\n")
        OUT.write("trnascan='true'\n")
        OUT.write("genetic_code='11'\n")
        OUT.write("translate_only='true'\n")
        OUT.write("blastn_identity='60'\n")
        OUT.write("blastp_identity='60'\n")
        OUT.write("blastn_hit_count='5'\n")
        OUT.write("blastp_hit_count='5'\n")
        OUT.write("ncbi_virus_genome_blast='false'\n")
        OUT.write("vog_gene_blast='false'\n")
        OUT.write("blastp='false'\n")
        OUT.write("phmmer='false'\n")
        OUT.write("jackhmmer='false'\n")
        OUT.write("pvogs_blast='false'\n")
        OUT.write("vog_protein_blast='false'\n")
        OUT.write("phantome_blast='false'\n")
        OUT.write("kegg_virus_blast='false'\n")
        OUT.write("swissprot_blast='false'\n")
        OUT.write("refseq_protein_blast='false'\n")
        OUT.write("ncbi_virus_protein_blast='false'\n")
        OUT.write("nr_blast='false'\n")
        OUT.write("cazy_blast='false'\n")
        OUT.write("ncbi_virus_genome_database_path=''\n")
        OUT.write("ncbi_virus_protein_database_path=''\n")
        OUT.write("kegg_virus_database_path='T40000.pep'\n")
        OUT.write("phantome_database_path=''\n")
        OUT.write("pvogs_database_path=''\n")
        OUT.write("vog_gene_database_path=''\n")
        OUT.write("vog_protein_database_path=''\n")
        OUT.write("swissprot_database_path=''\n")
        OUT.write("refseq_protein_database_path=''\n")
        OUT.write("nr_database_path=''\n")
        OUT.write("cazy_database_path=''\n")
        OUT.write("cazy_annotation_path=''\n")
        OUT.write("custom_genome_blast='false'\n")
        OUT.write("custom_genome_blast_database_path=''\n")
        OUT.write("custom_gene_blast='false'\n")
        OUT.write("custom_gene_blast_database_path=''\n")
        OUT.write("custom_protein_blast='false'\n")
        OUT.write("custom_protein_blast_database_path=''\n")
        OUT.write("hmmscan='false'\n")
        OUT.write("pvogs_hmm_profiles='false'\n")
        OUT.write("vogs_hmm_profiles='false'\n")
        OUT.write("pvogs_hmm_profiles_database_path=''\n")
        OUT.write("vogs_hmm_profiles_database_path=''\n")
        OUT.write("CGP='false'\n")
        OUT.write("cgp_identity_cutoff_gene='60'\n")
        OUT.write("cgp_identity_cutoff_protein='60'\n")
        OUT.write("HPC='false'\n")
        OUT.write(f"phate_threads='{threads}'\n")
        OUT.write("blast_threads='0'\n")
        OUT.write("cgp_threads='0' \n")
        OUT.write("checkpoint_phate='false'\n")
        OUT.write("checkpoint_cgp='false'\n")
        OUT.write("checkpoint_genomics='false'\n")
        OUT.write("phate_warnings='false'\n")
        OUT.write("phate_messages='false'\n")
        OUT.write("phate_progress='true'\n")
        OUT.write("clean_raw_data='true'\n")
    return(config_file)

def get_prefix(file,file_format):
    file_prefix = re.sub(f'.{file_format}','',file)
    file_prefix = re.sub('(.)+/','',file_prefix)
    return file_prefix

def index_split(genomes_file="NA",genomes_format="NA"):
    """Collect basic info about genomic sequences in the input file. Split the sequences as one sequence per file in fasta format"""
    print ("Indexing and splitting genomic sequences from",genomes_file)
    seq_counter = 0
    #Creat the directory to store the input genome fasta files
    Path("Input_Genomes").mkdir(parents=True, exist_ok=True)
    #Iterate over genomic sequences in the file. Collect basic Info
    for seqobj in SeqIO.parse(genomes_file, genomes_format):
        seq_counter += 1
        #Rename sequences to contain only alphanumerical charachters
        new_id = seqobj.id
        new_id = re.sub('\W','_',new_id)
        seq_info['Original_ID'][new_id] = seqobj.id
        seqobj.id = new_id
        #Do not allow duplicated IDs
        if (seqobj.id in seq_info['Description']):
            raise Exception(f'Duplicated ID: {seqobj.id} in {genome_file}')
        #Store sequence info in seq_info dict
        seq_info['Description'][seqobj.id] = seqobj.description
        seqobj.description = ""
        seq_info['Length'][seqobj.id] = len(seqobj.seq)
        seq_info['Original_File'][seqobj.id] = genomes_file
        seq_info['New_File'][seqobj.id] = seqobj.id + ".fasta"
        #Put each sequence in a single sequence fasta file
        fasta_outfile = "Input_Genomes/" + seqobj.id + ".fasta"
        with open(fasta_outfile, 'w', newline='') as OUT:
            SeqIO.write(seqobj, OUT, "fasta")
    print (f"Processed {seq_counter} sequences.")
    file_prefix = get_prefix(genomes_file, genomes_format)
    return(file_prefix)


seq_info = defaultdict(dict)
central()