7
7
import re
8
8
import argparse
9
9
import os .path
10
- from multiprocessing import Pool
11
- from tempfile import TemporaryDirectory
12
- import pickle
13
- from typing import List
14
10
import sys
15
11
BINDIR = os .path .dirname (os .path .realpath (__file__ ))
16
12
def find_kmer_hits(sequence, kmer):
    """Return the start index of every occurrence of ``kmer`` in ``sequence``.

    Occurrences may overlap: the pattern is wrapped in a zero-width
    look-ahead, so the scan advances one position at a time instead of
    jumping past each match.

    Args:
        sequence: String to search.
        kmer: Substring to look for. It is matched literally; regex
            metacharacters in it carry no special meaning.

    Returns:
        List of 0-based start positions in ascending order; empty when
        there is no occurrence or when ``kmer`` is empty.
    """
    # An empty k-mer (e.g. from a trailing comma in a comma-separated list)
    # would otherwise "match" at every position, including end-of-string.
    if not kmer:
        return []
    # Escape so the k-mer is treated as plain text, not as a regex fragment.
    lookahead = re.compile('(?=' + re.escape(kmer) + ')')
    return [match.start() for match in lookahead.finditer(sequence)]
18
14
19
15
def call_command (cmd ):
20
- p = subprocess .Popen (cmd ,shell = True ,stdin = None , stdout = PIPE , stderr = PIPE )
16
+ p = subprocess .Popen (cmd ,shell = True ,stdin = None , stdout = PIPE )
21
17
(result , error ) = p .communicate ()
22
18
if error :
23
19
raise RuntimeError ("Error in calling cmd or perl script\n cmd:{}\n stdout:{}\n stderr:{}" .format (cmd , result , error ))
@@ -67,50 +63,6 @@ def is_valid_file(file_name):
67
63
return os .path .abspath (file_name )
68
64
else :
69
65
raise FileNotFoundError (os .path .abspath (file_name ))
70
-
71
def multicore_wrapper(seq_record, args):
    """Build the CSV row of k-mer features for a single sequence record.

    Meant to be called once per record, e.g. via ``Pool.starmap``.

    Args:
        seq_record: Record exposing ``.id`` (string) and ``.seq``.
        args: Parsed command-line namespace. Reads ``kmers``,
            ``minE_subopt``, ``minE_intarna``, ``feature_context`` and
            ``report_counts``.

    Returns:
        ``"<id>,<f1>,<f2>,..."`` without a trailing newline. For every
        k-mer, one value is emitted per context letter present in
        ``args.feature_context`` (case-insensitive), always in the order
        a (any), s (intra), h (dimer), u (free). Values are hit counts when
        ``args.report_counts`` is set, otherwise ``0``/``1`` flags.
    """
    # Take the k-mers from `args` instead of the module-level `kmers_list`:
    # that global is only assigned under the `__main__` guard, so worker
    # processes started with the "spawn" method would not find it.
    kmers = args.kmers.split(',')
    # Loop-invariant: normalise the context selector once.
    context = args.feature_context.lower()
    sequence = str(seq_record.seq)

    row = [seq_record.id]
    print(seq_record.id)

    seq_subopt, seq_intarna = get_subopt_intarna_strs(
        sequence,
        minE_subopt=args.minE_subopt,
        minE_intarna=args.minE_intarna,
    )

    for kmer in kmers:
        hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(
            sequence, seq_subopt, seq_intarna, kmer)
        # Fixed a/s/h/u order keeps the row aligned with the CSV header.
        counts_by_flag = (
            ("a", len(hseq)),
            ("s", len(hsubopt)),
            ("h", len(hintarna)),
            ("u", len(hsubopt_intarna)),
        )
        selected = [count for flag, count in counts_by_flag if flag in context]

        if args.report_counts:
            row.extend(str(count) for count in selected)
        else:
            row.extend('0' if count == 0 else '1' for count in selected)

    # Joining the cells (rather than prepending "," per k-mer) avoids a
    # stray comma when no feature context is selected.
    return ','.join(row)
101
-
102
-
103
def write_pickled_output(files: List[str], outfile: str, csv_header: str):
    """Concatenate pickled batches of CSV rows into a single CSV output.

    Args:
        files: Paths of pickle files, each holding a list of row strings
            (no trailing newline). They are read in the given order.
        outfile: Destination path, or the literal ``"stdout"`` to write to
            standard output, matching the ``--out-csv`` option.
        csv_header: Header text written first, exactly as given (the caller
            supplies its own line terminator).

    Only one batch is held in memory at a time.
    """
    def _emit(sink):
        sink.write(csv_header)
        for path in files:
            with open(path, "rb") as handle:
                # CAUTION: unpickling can execute arbitrary code; only pass
                # batch files that this program wrote itself.
                rows = pickle.load(handle)
            # One line per row; an empty batch contributes nothing.
            sink.writelines(row + "\n" for row in rows)

    if outfile == "stdout":
        _emit(sys.stdout)
    else:
        with open(outfile, "w") as sink:
            _emit(sink)
111
-
112
-
113
-
114
66
115
67
if __name__ == '__main__' :
116
68
@@ -120,99 +72,35 @@ def write_pickled_output(files: List[str], outfile: str, csv_header: str):
120
72
121
73
parser .add_argument ('--kmers' , required = True , type = str , help = 'List of kmers as a comma separated string e.g. \" AGG,GA,GG\" ' )
122
74
parser .add_argument ('--fasta' , required = True , type = is_valid_file , help = 'Sequences to extract features from as a FASTA file' )
123
- parser .add_argument ('--threads' , type = int , default = 1 , help = 'Number of threads used for processing (default: 1) (WARNING: threads > 1 will impair stdout prints' )
124
- parser .add_argument ('--batchsize' , type = int , default = 10000 , help = 'If the number of processed fasta sequences is greater than batch size batch processing will be applied. This will lower memory consumption (default: 10000)' )
125
75
parser .add_argument ('--report-counts' , action = 'store_true' , help = 'Whether to report counts as integer, default is binary nohit(0)-hit(1)' ),
126
76
parser .add_argument ('--out-csv' , type = str , default = 'stdout' , help = 'CSV File name to write counts, pass "stdout" for stdout ' )
127
77
parser .add_argument ('--minE-subopt' , default = - 3 , type = int , help = 'Minimum free energy of the position on RNAsubopt result' )
128
78
parser .add_argument ('--minE-intarna' , default = - 3 , type = int , help = 'Minimum free energy of the position on IntaRNA result' )
129
- parser .add_argument ('--feature-context' , default = "ashu" , type = str , help = 'feature groups (contexts) are to be generated by case-insensitive single letter'
130
- + '\n \t a - any context (just k-mer occurrence)'
131
- + '\n \t s - unpaired in stable intra-molecular structures'
132
- + '\n \t h - unpaired in stable inter-molecular homo-duplex RRIs'
133
- + '\n \t u - unpaired in both in (s) and (h)' )
134
-
79
+
135
80
args = parser .parse_args ()
136
81
print (args )
137
82
out_csv_str = "id"
138
83
kmers_list = args .kmers .split (',' )
139
84
for kmer in kmers_list :
140
- if "a" in args .feature_context .lower ():
141
- out_csv_str += ",{}_any" .format (kmer )
142
- if "s" in args .feature_context .lower ():
143
- out_csv_str += ",{}_intra" .format (kmer )
144
- if "h" in args .feature_context .lower ():
145
- out_csv_str += ",{}_dimer" .format (kmer )
146
- if "u" in args .feature_context .lower ():
147
- out_csv_str += ",{}_free" .format (kmer )
148
-
85
+ out_csv_str += ",{}_any,{}_intra,{}_dimer,{}_free" .format (kmer ,kmer ,kmer ,kmer )
149
86
out_csv_str += '\n '
150
- if args .threads == 1 :
151
- for r in SeqIO .parse (args .fasta , format = 'fasta' ):
152
- print (r .id )
153
- out_csv_str += r .id
154
- seq_subopt , seq_intarna = get_subopt_intarna_strs (str (r .seq ), minE_subopt = args .minE_subopt , minE_intarna = args .minE_intarna )
155
- for kmer in kmers_list :
156
- hseq , hsubopt , hintarna , hsubopt_intarna = find_hits_all (str (r .seq ),seq_subopt ,seq_intarna , kmer )
157
- print (kmer , hseq , hsubopt , hintarna , hsubopt_intarna )
158
- cseq , csubopt , cintarna , csubopt_intarna = len (hseq ), len (hsubopt ), len (hintarna ), len (hsubopt_intarna )
159
- array_features = []
160
- if "a" in args .feature_context .lower ():
161
- array_features .append (cseq )
162
- if "s" in args .feature_context .lower ():
163
- array_features .append (csubopt )
164
- if "h" in args .feature_context .lower ():
165
- array_features .append (cintarna )
166
- if "u" in args .feature_context .lower ():
167
- array_features .append (csubopt_intarna )
168
-
169
- if args .report_counts is True :
170
- out_csv_str += '' .join ([',{}' .format (f ) for f in array_features ])
171
- else :
172
- binary_hits = ['0' if c == 0 else '1' for c in array_features ]
173
- out_csv_str += "," + ',' .join (binary_hits )
174
- out_csv_str += '\n '
175
-
176
- if args .out_csv == "stdout" :
177
- print (out_csv_str )
178
- else :
179
- with open (args .out_csv , 'w' ) as outfile :
180
- outfile .write (out_csv_str )
181
- else :
182
-
183
- calls = []
184
- for seq_record in SeqIO .parse (args .fasta , format = 'fasta' ):
185
- calls .append ((seq_record , args ))
186
-
187
- if args .batchsize < len (calls ):
188
- tmp_dir = TemporaryDirectory (prefix = "BrainDead" )
189
- files = []
190
- batch_calls = [calls [x :x + args .batchsize ] for x in range (0 , len (calls ), args .batchsize )]
191
- for x , batch in enumerate (batch_calls ):
192
- with Pool (processes = args .threads ) as pool :
193
- outstrings = pool .starmap (multicore_wrapper , batch )
194
- file = os .path .join (tmp_dir .name , f"batch_{ x } .pckl" )
195
- files .append (file )
196
- with open (file , "wb" ) as handle :
197
- pickle .dump (outstrings , handle )
198
- write_pickled_output (files = files ,
199
- outfile = args .out_csv ,
200
- csv_header = out_csv_str )
201
-
202
-
203
- else :
204
- with Pool (processes = args .threads ) as pool :
205
- outstrings = pool .starmap (multicore_wrapper , calls )
206
-
207
- out_csv_str += "\n " .join (outstrings ) + "\n "
208
-
209
- if args .out_csv == "stdout" :
210
- print (out_csv_str )
87
+ for r in SeqIO .parse (args .fasta , format = 'fasta' ):
88
+ print (r .id )
89
+ out_csv_str += r .id
90
+ seq_subopt , seq_intarna = get_subopt_intarna_strs (str (r .seq ), minE_subopt = args .minE_subopt , minE_intarna = args .minE_intarna )
91
+ for kmer in kmers_list :
92
+ hseq , hsubopt , hintarna , hsubopt_intarna = find_hits_all (str (r .seq ),seq_subopt ,seq_intarna , kmer )
93
+ print (kmer , hseq , hsubopt , hintarna , hsubopt_intarna )
94
+ cseq , csubopt , cintarna , csubopt_intarna = len (hseq ), len (hsubopt ), len (hintarna ), len (hsubopt_intarna )
95
+ if args .report_counts is True :
96
+ out_csv_str += ',{},{},{},{}' .format (cseq , csubopt , cintarna , csubopt_intarna )
211
97
else :
212
- with open (args .out_csv , 'w' ) as outfile :
213
- outfile .write (out_csv_str )
214
-
215
-
216
-
217
-
98
+ binary_hits = ['0' if c == 0 else '1' for c in [cseq , csubopt , cintarna , csubopt_intarna ]]
99
+ out_csv_str += "," + ',' .join (binary_hits )
100
+ out_csv_str += '\n '
218
101
102
+ if args .out_csv == "stdout" :
103
+ print (out_csv_str )
104
+ else :
105
+ with open (args .out_csv , 'w' ) as outfile :
106
+ outfile .write (out_csv_str )
0 commit comments