-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRun_Multiphate2.py
More file actions
179 lines (168 loc) · 8.48 KB
/
Run_Multiphate2.py
File metadata and controls
179 lines (168 loc) · 8.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#! /usr/bin/env python3
from collections import defaultdict
from Bio import SeqIO
from pathlib import Path
import argparse
import subprocess
import re
import glob
import gzip
parser = argparse.ArgumentParser()
parser.add_argument("--genomes_file", help="File of genomic sequences", type =str)
parser.add_argument("--in_format", help="Input Sequence file format", default="fasta", type =str)
parser.add_argument("--threads", help="The number of threads to be used", default=1, type=int)
parser.add_argument("--parse_only", help="Flag to skip running any programs and only parse their output", default=False, type=bool)
args = parser.parse_args()
def central():
file_prefix = index_split(genomes_file=args.genomes_file,genomes_format=args.in_format)
config_file = prepare_config_file(prefix=file_prefix,threads=args.threads)
call_multiphate(cf=config_file)
protein_file = collect_proteins(prefix=file_prefix)
def collect_proteins(prefix="NA"):
"""Parse the protein.faa files generated by multiphate for each genomes. Rename proteins to be consistent with prodigal nomeclautre and usign the original seq IDs. Merge all proteins in a single fasta file"""
fasta_outfile = prefix + ".faa"
with open(fasta_outfile, 'w', newline='') as OUT:
for seq_id in seq_info['Description'].keys():
genome_oid = seq_info['Original_ID'][seq_id]
prot_file = "/mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/PipelineOutput/"+seq_id+"/protein.faa"
seq_counter = 0
print(f"Processing {prot_file}")
try:
for seqobj in SeqIO.parse(prot_file, "fasta"):
seq_counter += 1
gene_nid = seqobj.id
sub_ids = gene_nid.split("_phanotate_")
gene_num = re.sub('_geneCall','',sub_ids[1])
gene_oid = genome_oid+"_"+gene_num
seqobj.id = gene_oid
SeqIO.write(seqobj, OUT, "fasta")
print(f"Processed {seq_counter} sequences")
except:
print(f"Exception while processing {prot_file}")
print (f"Processed {seq_counter} sequences.")
return(fasta_outfile)
def call_multiphate(cf="NA"):
"""Move files fron Input_Genomes/ to PipelineInput/ call multiphate2 with the previously generated configuration file"""
if (args.parse_only == False):
command = "mv Input_Genomes/*fasta /mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/PipelineInput/"
subprocess.call(command, shell=True)
print("Running multiPhATE2")
command = f"python3 /mnt/lustre/bio/users/fcoutinho/multiphate2/multiPhATE2/multiPhate.py {cf}"
subprocess.call(command, shell=True)
def prepare_config_file(prefix="MultiPhate2",threads=1):
"""Write a configuration file for multiphate2 based on the information stored in seq_info"""
config_file = f"{prefix}.config"
print(f"Writing configuration file to {config_file}")
with open(config_file, 'w', newline='') as OUT:
OUT.write("Genome List:\n")
genome_counter = 0
for genome_id in seq_info['Description'].keys():
genome_counter += 1
OUT.write(f"Genome {genome_counter}\n")
OUT.write(f"genome_file='{seq_info['New_File'][genome_id]}'\n")
OUT.write("genome_type='phage'\n")
OUT.write("genome_species='NA'\n")
OUT.write("END of list\n")
OUT.write("check_databases='false'\n")
OUT.write("phanotate_calls='true'\n")
OUT.write("prodigal_calls='false'\n")
OUT.write("glimmer_calls='false'\n")
OUT.write("genemarks_calls='false'\n")
OUT.write("genemarks_path=''\n")
OUT.write("custom_gene_calls='false'\n")
OUT.write("primary_calls='phanotate'\n")
OUT.write("trnascan='true'\n")
OUT.write("genetic_code='11'\n")
OUT.write("translate_only='true'\n")
OUT.write("blastn_identity='60'\n")
OUT.write("blastp_identity='60'\n")
OUT.write("blastn_hit_count='5'\n")
OUT.write("blastp_hit_count='5'\n")
OUT.write("ncbi_virus_genome_blast='false'\n")
OUT.write("vog_gene_blast='false'\n")
OUT.write("blastp='false'\n")
OUT.write("phmmer='false'\n")
OUT.write("jackhmmer='false'\n")
OUT.write("pvogs_blast='false'\n")
OUT.write("vog_protein_blast='false'\n")
OUT.write("phantome_blast='false'\n")
OUT.write("kegg_virus_blast='false'\n")
OUT.write("swissprot_blast='false'\n")
OUT.write("refseq_protein_blast='false'\n")
OUT.write("ncbi_virus_protein_blast='false'\n")
OUT.write("nr_blast='false'\n")
OUT.write("cazy_blast='false'\n")
OUT.write("ncbi_virus_genome_database_path=''\n")
OUT.write("ncbi_virus_protein_database_path=''\n")
OUT.write("kegg_virus_database_path='T40000.pep'\n")
OUT.write("phantome_database_path=''\n")
OUT.write("pvogs_database_path=''\n")
OUT.write("vog_gene_database_path=''\n")
OUT.write("vog_protein_database_path=''\n")
OUT.write("swissprot_database_path=''\n")
OUT.write("refseq_protein_database_path=''\n")
OUT.write("nr_database_path=''\n")
OUT.write("cazy_database_path=''\n")
OUT.write("cazy_annotation_path=''\n")
OUT.write("custom_genome_blast='false'\n")
OUT.write("custom_genome_blast_database_path=''\n")
OUT.write("custom_gene_blast='false'\n")
OUT.write("custom_gene_blast_database_path=''\n")
OUT.write("custom_protein_blast='false'\n")
OUT.write("custom_protein_blast_database_path=''\n")
OUT.write("hmmscan='false'\n")
OUT.write("pvogs_hmm_profiles='false'\n")
OUT.write("vogs_hmm_profiles='false'\n")
OUT.write("pvogs_hmm_profiles_database_path=''\n")
OUT.write("vogs_hmm_profiles_database_path=''\n")
OUT.write("CGP='false'\n")
OUT.write("cgp_identity_cutoff_gene='60'\n")
OUT.write("cgp_identity_cutoff_protein='60'\n")
OUT.write("HPC='false'\n")
OUT.write(f"phate_threads='{threads}'\n")
OUT.write("blast_threads='0'\n")
OUT.write("cgp_threads='0' \n")
OUT.write("checkpoint_phate='false'\n")
OUT.write("checkpoint_cgp='false'\n")
OUT.write("checkpoint_genomics='false'\n")
OUT.write("phate_warnings='false'\n")
OUT.write("phate_messages='false'\n")
OUT.write("phate_progress='true'\n")
OUT.write("clean_raw_data='true'\n")
return(config_file)
def get_prefix(file,file_format):
file_prefix = re.sub(f'.{file_format}','',file)
file_prefix = re.sub('(.)+/','',file_prefix)
return file_prefix
def index_split(genomes_file="NA",genomes_format="NA"):
"""Collect basic info about genomic sequences in the input file. Split the sequences as one sequence per file in fasta format"""
print ("Indexing and splitting genomic sequences from",genomes_file)
seq_counter = 0
#Creat the directory to store the input genome fasta files
Path("Input_Genomes").mkdir(parents=True, exist_ok=True)
#Iterate over genomic sequences in the file. Collect basic Info
for seqobj in SeqIO.parse(genomes_file, genomes_format):
seq_counter += 1
#Rename sequences to contain only alphanumerical charachters
new_id = seqobj.id
new_id = re.sub('\W','_',new_id)
seq_info['Original_ID'][new_id] = seqobj.id
seqobj.id = new_id
#Do not allow duplicated IDs
if (seqobj.id in seq_info['Description']):
raise Exception(f'Duplicated ID: {seqobj.id} in {genome_file}')
#Store sequence info in seq_info dict
seq_info['Description'][seqobj.id] = seqobj.description
seqobj.description = ""
seq_info['Length'][seqobj.id] = len(seqobj.seq)
seq_info['Original_File'][seqobj.id] = genomes_file
seq_info['New_File'][seqobj.id] = seqobj.id + ".fasta"
#Put each sequence in a single sequence fasta file
fasta_outfile = "Input_Genomes/" + seqobj.id + ".fasta"
with open(fasta_outfile, 'w', newline='') as OUT:
SeqIO.write(seqobj, OUT, "fasta")
print (f"Processed {seq_counter} sequences.")
file_prefix = get_prefix(genomes_file, genomes_format)
return(file_prefix)
seq_info = defaultdict(dict)
central()