-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprep_files_for_new_db.py
More file actions
115 lines (103 loc) · 3.23 KB
/
prep_files_for_new_db.py
File metadata and controls
115 lines (103 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import os
import glob
from Bio import SeqIO
import shutil
import argparse
# Command-line interface. Every option is mandatory, so they are registered
# in a dedicated group that --help lists under "required arguments".
parser = argparse.ArgumentParser(
    description='Prepares files for new database creation from subset of main database taxa'
)
required = parser.add_argument_group('required arguments')
# One (short flag, long flag, help text) entry per mandatory string option,
# registered in the order they should appear in the help output.
for short_flag, long_flag, help_text in (
    ('-t', '--taxa_list',
     'List of taxa as unique IDs to include in new database'),
    ('-d', '--master_db',
     'Path to master phylofisher database'),
    ('-o', '--out_dir',
     'Path to location where output directory for new database files will be made'),
):
    required.add_argument(
        short_flag, long_flag, type=str, help=help_text, required=True
    )
# Parsed options are read below as args.taxa_list, args.master_db, args.out_dir.
args = parser.parse_args()
## Open Taxa list for new DB
# `lines` holds one taxon unique ID per non-blank line of the taxa list file.
# The context manager guarantees the file is closed (the previous
# `infile.close` was missing its call parentheses and never closed it).
with open(args.taxa_list) as infile:
    # Strip every entry: blank detection already ignores whitespace, and an
    # ID carrying stray trailing spaces/tabs would otherwise never match a
    # metadata row or a proteome file name later in the script.
    lines = [line.strip() for line in infile if line.strip()]
# get masterDB path
masterdbpath = args.master_db
#Make output directories
# Layout: <out_dir>/new_database/{orthologs,paralogs,proteomes}
outdir = args.out_dir + "/new_database"
orthooutdir = outdir + "/orthologs"
paraoutdir = outdir + "/paralogs"
protoutdir = outdir + "/proteomes"
# The parent directory comes first so that its sub-directories can be created.
for new_dir in (outdir, orthooutdir, paraoutdir, protoutdir):
    try:
        os.mkdir(new_dir)
    except FileExistsError:
        # Only a pre-existing *directory* is acceptable; a regular file with
        # the same name would make every later write fail, so re-raise.
        if not os.path.isdir(new_dir):
            raise
        print(new_dir + " already exists")
    else:
        print(new_dir + " created")
    # Any other OSError (permission denied, missing parent directory, ...)
    # propagates instead of being misreported as "already exists".
def _write_taxa_subset(source_dir, dest_dir, taxa):
    """Write the selected taxa's records from each ``*.fas`` file to dest_dir.

    For every FASTA file in ``source_dir`` a file with the same name is
    created in ``dest_dir`` (even when no record matches) containing the
    records whose ID contains at least one entry of ``taxa``. The name of
    each processed file is printed.

    Args:
        source_dir: Directory holding the master ``*.fas`` files.
        dest_dir: Existing directory that receives the filtered files.
        taxa: Iterable of non-empty taxon unique-ID strings.
    """
    for fasta_path in glob.glob(os.path.join(source_dir, "*.fas")):
        fname = os.path.basename(fasta_path)
        print(fname)
        # Both handles are closed by the context manager; the input handle
        # was previously opened inline and never closed.
        with open(fasta_path) as infasta, \
                open(os.path.join(dest_dir, fname), "w") as outfasta:
            for record in SeqIO.parse(infasta, "fasta"):
                # any() stops at the first hit, so a record is written once
                # even if several taxon IDs match it.
                # NOTE(review): this is a substring test on record.id, so an
                # ID that is contained in another ID also matches -- confirm
                # the header format before tightening it to an exact match.
                if any(taxon in record.id for taxon in taxa):
                    SeqIO.write(record, outfasta, "fasta")


## Make new ortholog fastas from selected taxa
_write_taxa_subset(masterdbpath + "/orthologs", orthooutdir, lines)
## Make new paralog fastas from selected taxa
_write_taxa_subset(masterdbpath + "/paralogs", paraoutdir, lines)
## Copy proteome files to new directory
protpath = masterdbpath + "/proteomes/"
for taxon in lines:
    # The archive name is built as "<unique ID>.faa.tar.gz"; build the path
    # once and reuse it for both the copy and the progress message.
    proteome = protpath + taxon + ".faa.tar.gz"
    # A missing archive raises FileNotFoundError and stops the run.
    shutil.copy(proteome, protoutdir, follow_symlinks=True)
    print(proteome)
## Make new metadata file
# A set gives constant-time membership tests and ensures that a taxon listed
# more than once in the taxa list contributes only one metadata row.
wanted_ids = set(lines)
# The new file always starts with this fixed header row.
newmd = ['Unique ID\tLong Name\tHigher Taxonomy\tLower Taxonomy\tData Type\tSource']
# The context manager closes the master metadata file (the previous
# `metain.close` was missing its call parentheses and never closed it).
with open(masterdbpath + "/metadata.tsv") as metain:
    for mline in metain:
        mline = mline.rstrip('\n')
        if mline.strip() == "":
            continue  # skip blank lines
        # The first tab-separated column is compared against the taxon IDs.
        if mline.split('\t')[0] in wanted_ids:
            newmd.append(mline)
with open(outdir + '/metadata.tsv', 'w') as metaout:
    metaout.writelines(row + '\n' for row in newmd)