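"""
consurf.py - Python implementation of the ConSurf pipeline for estimating the evolutionary
conservation of a protein's residues.

Pipeline: phmmer (homologue search) -> cd-hit (redundancy removal) -> mafft (MSA)
-> rate4site (conservation scores). Results are saved in a folder in the current directory.
"""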
# Add software executables here. If they are not in your PATH, then add the full path of the executable
soft_exec_paths = {
    "phmmer": "phmmer",
    "cd-hit": "cd-hit",
    "mafft": "mafft-linsi",
    "rate4site": "rate4site"
}
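# e.g. "mafft": "/opt/mafft/bin/mafft-linsi" (hypothetical path) if the executable is not on your PATH

# Amino-acid one-letter codes accepted in the query sequence, including the ambiguity and
# non-standard codes B, Z, J, O and U, plus X for unknown residues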
Alphabet = set(['M', 'I', 'L', 'V', 'A', 'C', 'D', 'E', 'N', 'Q',
                'F', 'W', 'Y', 'G', 'P', 'S', 'T', 'H', 'K', 'R',
                'B', 'Z', 'J',
                'O', 'U', 'X'])
# Imports
import os
import subprocess
import argparse
from collections import defaultdict
from linecache import getline
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# Set (global) working directory
working_dir = Path(os.getcwd())
def PHMMER_stdout_parser(phmmer_stdout_file, E_val, query_seq, min_seq_identity, min_seq_len):
    """
    Extracts homologous sequences from a phmmer output file generated with the --notextw parameter.
    Only homologues with an E-value <= E_val, a sequence identity >= min_seq_identity and a relative
    sequence length >= min_seq_len (w.r.t the query) are included.
    Writes the selected sequences in FASTA format to 'phmmer.fasta' in the output directory.
    Partly inspired by Biopython's phmmer parser: https://biopython.org/DIST/docs/api/Bio.SearchIO.HmmerIO.hmmer3_text-pysrc.html#Hmmer3TextParser.__init__
    """
    min_len = min_seq_len * len(query_seq) # ex: 0.6 * 60 = 36 => homologues must be at least 36 residues long
    output_file = 'phmmer.fasta'
    with open(output_directory_path / phmmer_stdout_file, 'rt') as file:
        phmmer_stdout_readlines = file.readlines()
    # Here all indexes correspond to line numbers in the phmmer_stdout_file
    # All lines corresponding to a new sequence/entry begin with '>>'
    sequence_index_list = [index for index, line in enumerate(phmmer_stdout_readlines) if line.strip().startswith('>>')]
    # All lines corresponding to a homologous domain of a given sequence/entry begin with '=='
    domain_index_list = [index for index, line in enumerate(phmmer_stdout_readlines) if line.strip().startswith('==')]
    # Dictionary with sequences' index and their associated domain indexes. Simpler and faster to parse the index lists in reverse
    sequence_domain_index = defaultdict(list)
    for sequence_index in reversed(sequence_index_list):
        for domain_index in reversed(domain_index_list):
            if domain_index > sequence_index:
                sequence_domain_index[sequence_index].append(domain_index)
                del domain_index_list[-1]
            else:
                break
    del sequence_index_list, domain_index_list # No longer needed
    # Write the output FASTA file
    with open(output_directory_path / output_file, 'wt') as file:
        for sequence_index, domain_index_list in sequence_domain_index.items():
            # Only parse the sequence if there is a domain (in some rare cases there is none)
            if domain_index_list:
                assert phmmer_stdout_readlines[sequence_index + 1].strip().startswith('#')
                assert phmmer_stdout_readlines[sequence_index + 2].strip().startswith('---')
                # Dictionary with the i-Evalue of each domain
                domain_e_val = {}
                for i in range(len(domain_index_list)):
                    line = phmmer_stdout_readlines[sequence_index + 3 + i].split()
                    domain_number, e_val = line[0], line[5]
                    domain_e_val[domain_number] = str(e_val)
                # Select domains
                for domain_index in domain_index_list:
                    domain_number = phmmer_stdout_readlines[domain_index].split()[2]
                    alignment_len = len(phmmer_stdout_readlines[domain_index + 1].split()[-2])
                    seq_identity = len([char for char in phmmer_stdout_readlines[domain_index + 2].strip() if char not in ('+', ' ')]) / alignment_len # '+' and ' ' are the characters used by HMMER to mark non-identical positions
                    homologous_id, begin_id, homologous_seq, end_id = phmmer_stdout_readlines[domain_index + 3].split()
                    homologous_seq = homologous_seq.replace('-', '').upper() # Remove indels in the target sequence to obtain the original domain sequence
                    # Check validity of the domain
                    if float(domain_e_val[domain_number]) <= E_val and len(homologous_seq) >= min_len and seq_identity >= min_seq_identity:
                        header = f'> {homologous_id}_{begin_id}_{end_id} | E_val: {domain_e_val[domain_number]}\n'
                        # Wrap the sequence at 60 characters per line (FASTA convention)
                        homologous_seq = ''.join([homologous_seq[i : i + 60] + '\n' for i in range(0, len(homologous_seq), 60)])
                        file.write(header)
                        file.write(homologous_seq)
    return
def run_PHMMER(query_fasta_file_path, database_fasta_file_path,
               min_seq_identity = 0.35, min_seq_len = 0.60, E_val = 0.0001,
               N_CPU = 2):
    """
    Run phmmer with the query against the given database (both in FASTA format) with the given parameter settings.
    Produces two files:
    - The output file generated by phmmer (phmmer_stdout.txt)
    - A FASTA file with all the homologous sequences that fulfill the required parameters (phmmer.fasta)
    """
    # Check input FASTA file and extract sequence
    query_seq = ''
    with open(query_fasta_file_path, 'rt') as f:
        for index, line in enumerate(f):
            if index == 0:
                assert line.startswith('>'), "Incorrect header format, first line must begin with '>'"
            else:
                assert all(char in Alphabet for char in line.strip()), "Unknown character found in the query's FASTA sequence"
                query_seq += line.strip()
    # Run. Usage: phmmer [-options] <seqfile> <seqdb>
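    # Flags (standard phmmer options): -o writes the full report to a file, --notextw disables line
    # wrapping (one alignment row per line, which PHMMER_stdout_parser relies on), and -E/--domE/--incE
    # apply the E-value cutoff to the sequence, domain and inclusion thresholds respectively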
    output_file = 'phmmer_stdout.txt'
    cmd = [soft_exec_paths['phmmer'], '-o', str(output_directory_path / output_file), '--notextw', '--cpu', str(N_CPU), '--domE', str(E_val),
           '--incE', str(E_val), '-E', str(E_val), str(query_fasta_file_path), str(database_fasta_file_path)]
    subprocess.run(cmd, check=True)
    # Extract homologues at the given E-value, minimal sequence identity and minimal sequence length
    PHMMER_stdout_parser(phmmer_stdout_file = output_file,
                         E_val = E_val,
                         query_seq = query_seq,
                         min_seq_identity = min_seq_identity, min_seq_len = min_seq_len)
    # Make sure the parsed FASTA file is not empty (at least 1 homologous sequence)
    if os.stat(output_directory_path / 'phmmer.fasta').st_size == 0:
        raise ValueError("No homologous sequences were found in the database with the given parameters.")
    return
def run_CDHIT(query_fasta_file_path, clustering_seq_identity = 0.95, max_homol = 150):
    """
    Cluster the homologues found by phmmer with cd-hit to remove redundant sequences, keep at most
    max_homol representatives (sorted by E-value) and write them, preceded by the query sequence,
    to 'selected_homologues.fasta'.
    """
    input_file = 'phmmer.fasta'
    output_file = 'selected_homologues.fasta'
    # Run. Usage: cd-hit [Options]
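    # Flags (standard cd-hit options): -i input FASTA, -o output FASTA of cluster representatives,
    # -c sequence identity threshold for clustering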
    cmd = [soft_exec_paths['cd-hit'], '-i', str(output_directory_path / input_file), '-o', str(output_directory_path / output_file),
           '-c', str(clustering_seq_identity)]
    with open(output_directory_path / 'cdhit.log', 'wt') as stdout_file:
        subprocess.run(cmd, check=True, stdout=stdout_file)
    # Check the number of homologous sequences left after clustering by reading the cd-hit log file:
    # - If there are less than 5 -> raise an error
    # - If there are between 5-10 -> print a warning
    with open(output_directory_path / 'cdhit.log', 'rt') as file:
        readlines = file.readlines()
    # The line with the n° of clusters is written in the format 'n° ... finished ... n° ... clusters'
    for line in readlines:
        line = line.split()
        if all(keyword in line for keyword in ('finished', 'clusters')):
            n_clusters = int(line[-2])
            break
    else:
        raise ValueError("Could not determine the n° of clusters generated by cd-hit")
    if n_clusters < 5:
        raise ValueError(f'Only {n_clusters} homologues were found which is insufficient (need at least 5)')
    if 5 <= n_clusters <= 10:
        print(f'Only {n_clusters} homologues were found, calculations continue nonetheless')
    # If there are more than max_homol homologous sequences after clustering:
    # - Sort homologues by increasing E-value
    # - Sample max_homol homologous sequences at a regular interval
    # - Overwrite the output file with the selected sequences
    if n_clusters > max_homol:
        with open(output_directory_path / output_file, 'rt') as file:
            readlines = file.readlines()
        indexes = [index for index, line in enumerate(readlines) if line.startswith('>')]
        # Sort. Simpler and faster to use reversed indexes
        sorted_homologues = []
        for index in reversed(indexes):
            E_val = float(readlines[index].split()[-1])
            homologue_data = ''.join(readlines[index:]) # Includes header and sequence
            sorted_homologues.append((E_val, homologue_data))
            del readlines[index:]
        sorted_homologues.sort() # Sorts by increasing value by default
        # Sample at a regular interval. Equivalent to range(0, n_clusters, n_clusters/max_homol), but
        # range doesn't allow float steps so use this trick instead. int() always rounds down (ex: int(1.9) = 1)
        selected_indexes = [int(i * (n_clusters/max_homol)) for i in range(max_homol)]
        selected_homologues = [sorted_homologues[index][1] for index in selected_indexes]
        # Overwrite
        with open(output_directory_path / output_file, 'wt') as file:
            for homologue in selected_homologues:
                file.write(homologue) # Header and sequence
    # Rewrite the output file so that the query is the first sequence in the file (MAFFT considers the first sequence as the reference to build the alignment)
    with open(query_fasta_file_path, 'rt') as file:
        readlines = file.readlines()
    with open(output_directory_path / output_file, 'rt') as file:
        readlines += file.readlines()
    with open(output_directory_path / output_file, 'wt') as file:
        for line in readlines:
            file.write(line)
    # Remove the .clstr file generated by cd-hit
    os.remove(output_directory_path / (output_file + '.clstr'))
    return
def run_MAFFT(N_CPU = 2):
    """
    Align the query and its selected homologues with MAFFT (L-INS-i) and write the multiple
    sequence alignment to 'query_msa.aln'.
    """
    input_file = 'selected_homologues.fasta'
    output_file = 'query_msa.aln'
    # Run. Usage: mafft [Options] in > out
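    # mafft writes the alignment to stdout, so redirect stdout into the output file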
    cmd = [soft_exec_paths['mafft'], '--thread', str(N_CPU), '--quiet', str(output_directory_path / input_file)]
    with open(output_directory_path / output_file, 'wt') as stdout_file:
        subprocess.run(cmd, check=True, stdout=stdout_file)
    return
def run_evolutionary_model(model, input_file):
    """
    Run rate4site on the MSA with the given evolutionary substitution model and return the
    log-likelihood of the fitted model. Writes the normalized/original conservation grades and
    the inferred phylogenetic tree to '<model>_*' files in the output directory.
    """
    input_file_path = str(output_directory_path / input_file)
    normalized_grades_file_path = str(output_directory_path / f'{model}_normalized_grades.txt')
    original_grades_file_path = str(output_directory_path / f'{model}_original_grades.txt')
    tree_file_path = str(output_directory_path / f'{model}_tree.txt')
    # Run. Usage: rate4site [-options]
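    # f'-{model}' selects the substitution-model flag (e.g. '-Mj' for JTT); as used below, -s is the
    # input MSA, -o the normalized grades, -y the original grades and -x the inferred tree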
    cmd = [soft_exec_paths['rate4site'], f'-{model}', '-s', input_file_path, '-o', normalized_grades_file_path, '-y', original_grades_file_path,
           '-x', tree_file_path]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL) # Still does not suppress some of rate4site's print messages to the console
    # Get the log-likelihood (#LL) of the fitted evolutionary model, written on line 13 of the grades file
    line = getline(filename = normalized_grades_file_path, lineno = 13)
    if not line.startswith('#LL='): # Just to make sure
        raise ValueError(f'Could not read the rate4site output file correctly, check {normalized_grades_file_path}')
    likelihood = float(line.split('=')[-1])
    return likelihood
def run_rate4site(evolutionary_model = 'best', N_CPU = 2):
    """
    Estimate per-residue conservation scores with rate4site. If evolutionary_model is 'best', the
    JTT, Dayhoff, WAG and LG models are all evaluated (in parallel) and the one with the highest
    log-likelihood is kept.
    """
    input_file = 'query_msa.aln'
    evolutionary_models_dico = {'best':['Mj', 'Md', 'Mw', 'Ml'],
                                'JTT':['Mj'], 'Dayhoff':['Md'], 'WAG':['Mw'], 'LG':['Ml'],
                                'mitochondrial':['mtREV'],
                                'chloroplast':['cpREV']}
    selected_models = evolutionary_models_dico[evolutionary_model]
    # Run the models and select the one with the maximum likelihood if there are multiple models to evaluate
    models_likelihood = {}
    with ProcessPoolExecutor(max_workers = N_CPU) as executor:
        for model, likelihood in zip(selected_models, executor.map(run_evolutionary_model, selected_models, [input_file]*len(selected_models))):
            models_likelihood[model] = likelihood
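    # Log-likelihoods are negative; the largest value (closest to zero) indicates the best-fitting model.
    # NB: the worker processes rely on the global output_directory_path being inherited (fork start method)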
    best_model = max(models_likelihood, key=models_likelihood.get)
    evolutionary_models_translation = {'Mj':'JTT', 'Md':'Dayhoff', 'Mw':'WAG', 'Ml':'LG',
                                       'mtREV':'mitochondrial(mtREV)', 'cpREV':'chloroplast(cpREV)'}
    print(f'Selected model: {evolutionary_models_translation[best_model]}')
    # Remove the files of the unwanted models and rename the files of the selected model (remove the model prefix)
    for model in models_likelihood:
        if model != best_model:
            os.remove(output_directory_path / f'{model}_original_grades.txt')
            os.remove(output_directory_path / f'{model}_normalized_grades.txt')
            os.remove(output_directory_path / f'{model}_tree.txt')
    os.rename(src = output_directory_path / f'{best_model}_original_grades.txt', dst = output_directory_path / 'original_grades.txt')
    os.rename(src = output_directory_path / f'{best_model}_normalized_grades.txt', dst = output_directory_path / 'normalized_grades.txt')
    os.rename(src = output_directory_path / f'{best_model}_tree.txt', dst = output_directory_path / 'tree.txt')
    # rate4site also creates an empty 'r4s.res' file in the working directory; remove it if present
    if os.path.isfile(working_dir / 'r4s.res') and os.stat(working_dir / 'r4s.res').st_size == 0:
        os.remove(working_dir / 'r4s.res')
    return
def check_arguments(args):
    """
    Check the validity of the parameters/arguments given on the command line.
    """
    if not os.path.isfile(args.input_file):
        raise argparse.ArgumentTypeError(f"The input file path is invalid ({args.input_file})")
    if args.database is None or not os.path.isfile(args.database):
        raise argparse.ArgumentTypeError(f"The database file path is invalid ({args.database})")
    if not args.e_val > 0:
        raise argparse.ArgumentTypeError("E-value must be > 0")
    if not 0 <= args.min_seq_identity <= 1:
        raise argparse.ArgumentTypeError("Minimal global sequence identity must be between 0 and 1")
    if not 0 <= args.min_seq_len <= 1:
        raise argparse.ArgumentTypeError("Minimal relative length must be between 0 and 1")
    if not 1 <= args.max_homol <= 199:
        raise argparse.ArgumentTypeError("The number of homologues must be between 1 and 199")
    return
def argument_parser():
    """
    Build the command-line argument parser.
    """
    parser = argparse.ArgumentParser(description = "Description:\n Python implementation of the ConSurf pipeline for the estimation of residues' evolutionary conservation.",
                                     usage = 'consurf <input_file> [parameters]',
                                     epilog = "Results are saved in a folder in the current directory\n ",
                                     formatter_class = argparse.RawTextHelpFormatter,
                                     add_help = False)
    parser._positionals.title = 'Input Data'
    parser._optionals.title = 'Parameters'
    # Input data
    parser.add_argument('input_file', type = str,
                        help = 'Full path of a FASTA file containing the query sequence (1 chain only)')
    parser.add_argument('-database', type = str, metavar = '',
                        help = "Full path of a FASTA formatted database that is used to search for homologous sequences\n ")
    # Parameters
    parser.add_argument('-model', type = str, choices = ['best', 'JTT', 'Dayhoff', 'WAG', 'LG', 'mitochondrial', 'chloroplast'], default = 'best', metavar = '',
                        help = "Options: best(default), JTT, Dayhoff, WAG, LG, mitochondrial, chloroplast\nEvolutionary substitution model to be used by the rate4site program. By default it will\ndetermine which of JTT/Dayhoff/WAG/LG fits best with the analyzed sequences.\n ")
    parser.add_argument('-e_val', type = float, default = 0.0001, metavar = '',
                        help = "Default: 0.0001\nMaximal E-value of homologous sequences selected by phmmer\n ")
    parser.add_argument('-min_seq_identity', type = float, default = 0.35, metavar = '',
                        help = 'Default: 0.35\nMinimal global sequence identity of homologous sequences w.r.t the input query sequence\n ')
    parser.add_argument('-min_seq_len', type = float, default = 0.60, metavar = '',
                        help = 'Default: 0.60\nMinimal relative length of homologous sequences w.r.t the input query sequence\n ')
    parser.add_argument('-clust_seq_identity', type = float, default = 0.95, metavar = '',
                        help = 'Default: 0.95\nSequence identity cutoff used by cd-hit to cluster similar homologues and select a\nrepresentative sequence (avoids having redundant homologues)\n ')
    parser.add_argument('-max_homol', type = int, default = 150, metavar = '',
                        help = 'Default: 150\nMaximum number of homologues to use. If their number exceeds the limit then they are sorted\nby their E-value and selected at a regular interval (ex: if there are 600 homologues in\ntotal, then homologues 1,5,9,... will be selected)\n ')
    parser.add_argument('-n_cpu', type = int, choices = [1, 2], default = 1, metavar = '', ### Adapt choices for github release
                        help = "Options: 1(default) or 2\nNumber of cores\n ")
    parser.add_argument('-h', '--help', action = 'help', default = argparse.SUPPRESS,
                        help = "Show this help message and exit\n ")
    return parser
def main(query_fasta_file_path, database_fasta_file_path,
         E_val = 0.0001, min_seq_identity = 0.35, min_seq_len = 0.60,
         clustering_seq_identity = 0.95, max_homol = 150,
         evolutionary_model = 'best',
         N_CPU = 2):
    """
    Run the full ConSurf pipeline: phmmer -> cd-hit -> mafft -> rate4site.
    """
    # Create the output directory where all generated files will be saved. If a directory with the
    # same name already exists then create one with a '_n°' suffix
    query_id = os.path.split(query_fasta_file_path)[-1] # Tail of the path
    query_id = query_id.rsplit('.', maxsplit = 1)[0] if '.' in query_id else query_id
    database_id = os.path.split(database_fasta_file_path)[-1]
    database_id = database_id.rsplit('.', maxsplit = 1)[0] if '.' in database_id else database_id
    global output_directory_path # Avoids having to add the directory as a parameter in each function
    base_output_directory_path = working_dir / f'{query_id}_{database_id}'
    output_directory_path = base_output_directory_path
    counter = 1
    while os.path.isdir(output_directory_path):
        output_directory_path = Path(f'{base_output_directory_path}_{counter}')
        counter += 1
    os.mkdir(output_directory_path)
    # Run pipeline
    print('Finding homologous sequences ...')
    run_PHMMER(query_fasta_file_path = query_fasta_file_path, database_fasta_file_path = database_fasta_file_path,
               E_val = E_val, min_seq_identity = min_seq_identity, min_seq_len = min_seq_len, N_CPU = N_CPU)
    print('Clustering sequences to remove redundancy ...')
    run_CDHIT(query_fasta_file_path = query_fasta_file_path, clustering_seq_identity = clustering_seq_identity,
              max_homol = max_homol)
    print('Building multiple sequence alignment ...')
    run_MAFFT(N_CPU = N_CPU)
    print('Calculating conservation scores ...')
    run_rate4site(evolutionary_model = evolutionary_model, N_CPU = N_CPU)
    print(f'Consurf run was successful. Results are in {output_directory_path}.')
    return
if __name__ == '__main__':
    # Parse and validate command-line arguments
    parser = argument_parser()
    args = parser.parse_args()
    check_arguments(args)
    # Run
    main(query_fasta_file_path = args.input_file, database_fasta_file_path = args.database,
         E_val = args.e_val, min_seq_identity = args.min_seq_identity, min_seq_len = args.min_seq_len,
         clustering_seq_identity = args.clust_seq_identity, max_homol = args.max_homol,
         evolutionary_model = args.model,
         N_CPU = args.n_cpu)
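
# Example invocation (hypothetical file names):
#   python consurf.py query.fasta -database uniprot_sprot.fasta -model best -n_cpu 2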
# Allow for pre-built MSA input ?
# Write log file in case of errors ?
# Perform mapping to PDB ATOM records ?