Skip to content

Commit 667c2fc

Browse files
committed
v1.0.1 from webserver
1 parent 0d0872f commit 667c2fc

File tree

3 files changed

+37
-160
lines changed

3 files changed

+37
-160
lines changed

README.md

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -65,41 +65,27 @@ At the end of the page, you find the [example calls](#sample-calls-on-test-data)
6565
### Generate kmer features with `generate_kmer_features.py`
6666

6767
```
68-
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--threads THREADS]
69-
[--batchsize BATCHSIZE] [--report-counts]
68+
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--report-counts]
7069
[--out-csv OUT_CSV] [--minE-subopt MINE_SUBOPT]
7170
[--minE-intarna MINE_INTARNA]
72-
[--feature-context FEATURE_CONTEXT]
7371
7472
Generate kmer-based count features based on sequence plus RNAsubopt and IntaRNA position-
75-
wise energies.. Sample calls: "python generate_kmer_features.py --kmers "AGA,GC,GGG"
76-
--fasta test.fa --out-csv counts.csv""python generate_kmer_features.py --kmers
77-
"AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-
78-
intarna -2"
73+
wise energies.
74+
Sample calls:
75+
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv counts.csv"
76+
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-intarna -2"
7977
8078
optional arguments:
8179
-h, --help show this help message and exit
8280
--kmers KMERS List of kmers as a comma separated string e.g. "AGG,GA,GG"
8381
--fasta FASTA Sequences to extract features from as a FASTA file
84-
--threads THREADS Number of threads used for processing (default: 1) (WARNING:
85-
threads > 1 will impair stdout prints
86-
--batchsize BATCHSIZE
87-
If the number of processed fasta sequences is greater than batch
88-
size batch processing will be applied. This will lower memory
89-
consumption (default: 10000)
9082
--report-counts Whether to report counts as integer, default is binary
9183
nohit(0)-hit(1)
9284
--out-csv OUT_CSV CSV File name to write counts, pass "stdout" for stdout
9385
--minE-subopt MINE_SUBOPT
9486
Minimum free energy of the position on RNAsubopt result
9587
--minE-intarna MINE_INTARNA
9688
Minimum free energy of the position on IntaRNA result
97-
--feature-context FEATURE_CONTEXT
98-
feature groups (contexts) are to be generated by case-insensitive
99-
single letter a - any context (just k-mer occurrence) s -
100-
unpaired in stable intra-molecular structures h - unpaired in
101-
stable inter-molecular homo-duplex RRIs u - unpaired in both in
102-
(s) and (h)
10389
```
10490

10591
### Fit and predict ML model with `fit_predict.py`

src/fit_predict.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
9191
X = df_selected_features.values
9292
#print("X", X)
9393
#print("Y", Y)
94-
n_splits = 3
94+
n_splits = 2
9595
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=10, random_state=2222)
9696
#skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=True)
9797
arr_PRFS = list()#numpy.array()
@@ -109,7 +109,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
109109
columns=["Precision","Recall","F1"], index=['metric','std']).round(3)
110110

111111
print("10-rep {}-fold CV:".format(n_splits))
112-
print(df_metrics.to_csv(sep=','))
112+
print(df_metrics)
113113

114114
scikit_model.fit(X, y)
115115
print("Training score: {0:.3f}".format(scikit_model.score(X, y)))
@@ -167,13 +167,13 @@ def is_valid_file(file_name):
167167
model = pickle.load(inmodel)
168168
else:
169169
if args.model_choice == 'SVM-rbf':
170-
model = svm.SVC(kernel='rbf', gamma='scale', probability=True)
170+
model = svm.SVC(kernel='rbf', gamma='scale', probability=True, class_weight="balanced")
171171
if args.model_choice == 'SVM-linear':
172-
model = svm.SVC(kernel='linear', gamma='scale', probability=True)
172+
model = svm.SVC(kernel='linear', gamma='scale', probability=True, class_weight="balanced")
173173
elif args.model_choice == 'Logistic-liblinear':
174-
model = LogisticRegression(solver='liblinear')
174+
model = LogisticRegression(solver='liblinear', class_weight="balanced")
175175
elif args.model_choice == 'Logistic-lbfgs':
176-
model = LogisticRegression(solver='lbfgs')
176+
model = LogisticRegression(solver='lbfgs', class_weight="balanced")
177177

178178
if args.standardize_scaling:
179179
# build pipe: first standardize by subtracting mean and dividing std
@@ -200,8 +200,11 @@ def is_valid_file(file_name):
200200
# with open(args.out_predict_labels, 'w') as outlabels:
201201
# outlabels.write('predicted_class,prob_{},prob_{}\n'.format(model.classes_[0],model.classes_[1]))
202202
# y_predict.tofile(outlabels,sep="\n")
203-
pd.DataFrame(stacked_arr, index=df_features_predict.index).to_csv(args.out_predict_labels, index=True,
204-
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1])],float_format='%.2F')
203+
df_out_predict = pd.DataFrame(stacked_arr, index=df_features_predict.index)
204+
df_out_predict.sort_values(by=[df_out_predict.columns[1],df_out_predict.columns[2]], ascending=False, inplace=True)
205+
df_out_predict['rank'] = df_out_predict.iloc[:,2].rank(method="first",ascending=False)
206+
df_out_predict.to_csv(args.out_predict_labels, index=True,
207+
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1]),'rank'],float_format='%.3F')
205208

206209
if args.save_model is True:
207210
with open(args.out_model, 'wb') as outf:

src/generate_kmer_features.py

Lines changed: 21 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,13 @@
77
import re
88
import argparse
99
import os.path
10-
from multiprocessing import Pool
11-
from tempfile import TemporaryDirectory
12-
import pickle
13-
from typing import List
1410
import sys
1511
BINDIR = os.path.dirname(os.path.realpath(__file__))
1612
def find_kmer_hits(sequence, kmer):
1713
return [m.start() for m in re.finditer('(?='+kmer+')', sequence)] # re with look-ahead for overlaps
1814

1915
def call_command (cmd):
20-
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE, stderr=PIPE)
16+
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE)
2117
(result, error) = p.communicate()
2218
if error:
2319
raise RuntimeError("Error in calling cmd or perl script\ncmd:{}\nstdout:{}\nstderr:{}".format(cmd, result, error))
@@ -67,50 +63,6 @@ def is_valid_file(file_name):
6763
return os.path.abspath(file_name)
6864
else:
6965
raise FileNotFoundError(os.path.abspath(file_name))
70-
71-
def multicore_wrapper(seq_record, args):
72-
out_csv_str = seq_record.id
73-
print(seq_record.id)
74-
75-
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(seq_record.seq),
76-
minE_subopt=args.minE_subopt,
77-
minE_intarna=args.minE_intarna)
78-
for kmer in kmers_list:
79-
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(seq_record.seq),
80-
seq_subopt,
81-
seq_intarna,
82-
kmer)
83-
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(
84-
hsubopt), len(hintarna), len(hsubopt_intarna)
85-
array_features = []
86-
if "a" in args.feature_context.lower():
87-
array_features.append(cseq)
88-
if "s" in args.feature_context.lower():
89-
array_features.append(csubopt)
90-
if "h" in args.feature_context.lower():
91-
array_features.append(cintarna)
92-
if "u" in args.feature_context.lower():
93-
array_features.append(csubopt_intarna)
94-
95-
if args.report_counts is True:
96-
out_csv_str += ''.join([',{}'.format(f) for f in array_features])
97-
else:
98-
binary_hits = ['0' if c == 0 else '1' for c in array_features]
99-
out_csv_str += "," + ','.join(binary_hits)
100-
return out_csv_str
101-
102-
103-
def write_pickled_output(files: List[str], outfile: str, csv_header: str):
104-
with open(outfile, "w") as of:
105-
of.write(csv_header)
106-
for file in files:
107-
with open(file, "rb") as handle:
108-
data = pickle.load(handle)
109-
of.write("\n".join(data) + "\n")
110-
del data
111-
112-
113-
11466

11567
if __name__ == '__main__':
11668

@@ -120,99 +72,35 @@ def write_pickled_output(files: List[str], outfile: str, csv_header: str):
12072

12173
parser.add_argument('--kmers', required=True, type=str, help='List of kmers as a comma separated string e.g. \"AGG,GA,GG\"')
12274
parser.add_argument('--fasta', required=True, type=is_valid_file, help='Sequences to extract features from as a FASTA file')
123-
parser.add_argument('--threads', type=int, default=1, help='Number of threads used for processing (default: 1) (WARNING: threads > 1 will impair stdout prints')
124-
parser.add_argument('--batchsize', type=int, default=10000, help='If the number of processed fasta sequences is greater than batch size batch processing will be applied. This will lower memory consumption (default: 10000)')
12575
parser.add_argument('--report-counts', action='store_true', help='Whether to report counts as integer, default is binary nohit(0)-hit(1)'),
12676
parser.add_argument('--out-csv', type=str, default='stdout', help='CSV File name to write counts, pass "stdout" for stdout ')
12777
parser.add_argument('--minE-subopt', default=-3, type=int, help='Minimum free energy of the position on RNAsubopt result')
12878
parser.add_argument('--minE-intarna', default=-3, type=int, help='Minimum free energy of the position on IntaRNA result')
129-
parser.add_argument('--feature-context', default="ashu", type=str, help='feature groups (contexts) are to be generated by case-insensitive single letter'
130-
+'\n\ta - any context (just k-mer occurrence)'
131-
+'\n\ts - unpaired in stable intra-molecular structures'
132-
+'\n\th - unpaired in stable inter-molecular homo-duplex RRIs'
133-
+'\n\tu - unpaired in both in (s) and (h)')
134-
79+
13580
args = parser.parse_args()
13681
print(args)
13782
out_csv_str = "id"
13883
kmers_list = args.kmers.split(',')
13984
for kmer in kmers_list:
140-
if "a" in args.feature_context.lower():
141-
out_csv_str += ",{}_any".format(kmer)
142-
if "s" in args.feature_context.lower():
143-
out_csv_str += ",{}_intra".format(kmer)
144-
if "h" in args.feature_context.lower():
145-
out_csv_str += ",{}_dimer".format(kmer)
146-
if "u" in args.feature_context.lower():
147-
out_csv_str += ",{}_free".format(kmer)
148-
85+
out_csv_str += ",{}_any,{}_intra,{}_dimer,{}_free".format(kmer,kmer,kmer,kmer)
14986
out_csv_str += '\n'
150-
if args.threads == 1:
151-
for r in SeqIO.parse(args.fasta, format='fasta'):
152-
print(r.id)
153-
out_csv_str += r.id
154-
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
155-
for kmer in kmers_list:
156-
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
157-
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
158-
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
159-
array_features = []
160-
if "a" in args.feature_context.lower():
161-
array_features.append(cseq)
162-
if "s" in args.feature_context.lower():
163-
array_features.append(csubopt)
164-
if "h" in args.feature_context.lower():
165-
array_features.append(cintarna)
166-
if "u" in args.feature_context.lower():
167-
array_features.append(csubopt_intarna)
168-
169-
if args.report_counts is True:
170-
out_csv_str += ''.join([',{}'.format(f) for f in array_features])
171-
else:
172-
binary_hits = ['0' if c==0 else '1' for c in array_features]
173-
out_csv_str += ","+','.join(binary_hits)
174-
out_csv_str += '\n'
175-
176-
if args.out_csv == "stdout":
177-
print(out_csv_str)
178-
else:
179-
with open(args.out_csv, 'w') as outfile:
180-
outfile.write(out_csv_str)
181-
else:
182-
183-
calls = []
184-
for seq_record in SeqIO.parse(args.fasta, format='fasta'):
185-
calls.append((seq_record, args))
186-
187-
if args.batchsize < len(calls):
188-
tmp_dir = TemporaryDirectory(prefix="BrainDead")
189-
files = []
190-
batch_calls = [calls[x:x+args.batchsize] for x in range(0, len(calls), args.batchsize)]
191-
for x, batch in enumerate(batch_calls):
192-
with Pool(processes=args.threads) as pool:
193-
outstrings = pool.starmap(multicore_wrapper, batch)
194-
file = os.path.join(tmp_dir.name, f"batch_{x}.pckl")
195-
files.append(file)
196-
with open(file, "wb") as handle:
197-
pickle.dump(outstrings, handle)
198-
write_pickled_output(files=files,
199-
outfile=args.out_csv,
200-
csv_header=out_csv_str)
201-
202-
203-
else:
204-
with Pool(processes=args.threads) as pool:
205-
outstrings = pool.starmap(multicore_wrapper, calls)
206-
207-
out_csv_str += "\n".join(outstrings) + "\n"
208-
209-
if args.out_csv == "stdout":
210-
print(out_csv_str)
87+
for r in SeqIO.parse(args.fasta, format='fasta'):
88+
print(r.id)
89+
out_csv_str += r.id
90+
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
91+
for kmer in kmers_list:
92+
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
93+
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
94+
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
95+
if args.report_counts is True:
96+
out_csv_str += ',{},{},{},{}'.format(cseq, csubopt, cintarna, csubopt_intarna)
21197
else:
212-
with open(args.out_csv, 'w') as outfile:
213-
outfile.write(out_csv_str)
214-
215-
216-
217-
98+
binary_hits = ['0' if c==0 else '1' for c in [cseq, csubopt, cintarna, csubopt_intarna]]
99+
out_csv_str += ","+','.join(binary_hits)
100+
out_csv_str += '\n'
218101

102+
if args.out_csv == "stdout":
103+
print(out_csv_str)
104+
else:
105+
with open(args.out_csv, 'w') as outfile:
106+
outfile.write(out_csv_str)

0 commit comments

Comments (0)