Skip to content

Commit 667c2fc

Browse files
committed
v1.0.1 from webserver
1 parent 0d0872f commit 667c2fc

File tree

3 files changed

+37
-160
lines changed

3 files changed

+37
-160
lines changed

README.md

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -65,41 +65,27 @@ At the end of the page, you find the [example calls](#sample-calls-on-test-data)
6565
### Generate kmer features with `generate_kmer_features.py`
6666

6767
```
68-
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--threads THREADS]
69-
[--batchsize BATCHSIZE] [--report-counts]
68+
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--report-counts]
7069
[--out-csv OUT_CSV] [--minE-subopt MINE_SUBOPT]
7170
[--minE-intarna MINE_INTARNA]
72-
[--feature-context FEATURE_CONTEXT]
7371
7472
Generate kmer-based count features based on sequence plus RNAsubopt and IntaRNA position-
75-
wise energies.. Sample calls: "python generate_kmer_features.py --kmers "AGA,GC,GGG"
76-
--fasta test.fa --out-csv counts.csv""python generate_kmer_features.py --kmers
77-
"AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-
78-
intarna -2"
73+
wise energies.
74+
Sample calls:
75+
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv counts.csv"
76+
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-intarna -2"
7977
8078
optional arguments:
8179
-h, --help show this help message and exit
8280
--kmers KMERS List of kmers as a comma separated string e.g. "AGG,GA,GG"
8381
--fasta FASTA Sequences to extract features from as a FASTA file
84-
--threads THREADS Number of threads used for processing (default: 1) (WARNING:
85-
threads > 1 will impair stdout prints
86-
--batchsize BATCHSIZE
87-
If the number of processed fasta sequences is greater than batch
88-
size batch processing will be applied. This will lower memory
89-
consumption (default: 10000)
9082
--report-counts Whether to report counts as integer, default is binary
9183
nohit(0)-hit(1)
9284
--out-csv OUT_CSV CSV File name to write counts, pass "stdout" for stdout
9385
--minE-subopt MINE_SUBOPT
9486
Minimum free energy of the position on RNAsubopt result
9587
--minE-intarna MINE_INTARNA
9688
Minimum free energy of the position on IntaRNA result
97-
--feature-context FEATURE_CONTEXT
98-
feature groups (contexts) are to be generated by case-insensitive
99-
single letter a - any context (just k-mer occurrence) s -
100-
unpaired in stable intra-molecular structures h - unpaired in
101-
stable inter-molecular homo-duplex RRIs u - unpaired in both in
102-
(s) and (h)
10389
```
10490

10591
### Fit and predict ML model with `fit_predict.py`

src/fit_predict.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
9191
X = df_selected_features.values
9292
#print("X", X)
9393
#print("Y", Y)
94-
n_splits = 3
94+
n_splits = 2
9595
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=10, random_state=2222)
9696
#skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=True)
9797
arr_PRFS = list()#numpy.array()
@@ -109,7 +109,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
109109
columns=["Precision","Recall","F1"], index=['metric','std']).round(3)
110110

111111
print("10-rep {}-fold CV:".format(n_splits))
112-
print(df_metrics.to_csv(sep=','))
112+
print(df_metrics)
113113

114114
scikit_model.fit(X, y)
115115
print("Training score: {0:.3f}".format(scikit_model.score(X, y)))
@@ -167,13 +167,13 @@ def is_valid_file(file_name):
167167
model = pickle.load(inmodel)
168168
else:
169169
if args.model_choice == 'SVM-rbf':
170-
model = svm.SVC(kernel='rbf', gamma='scale', probability=True)
170+
model = svm.SVC(kernel='rbf', gamma='scale', probability=True, class_weight="balanced")
171171
if args.model_choice == 'SVM-linear':
172-
model = svm.SVC(kernel='linear', gamma='scale', probability=True)
172+
model = svm.SVC(kernel='linear', gamma='scale', probability=True, class_weight="balanced")
173173
elif args.model_choice == 'Logistic-liblinear':
174-
model = LogisticRegression(solver='liblinear')
174+
model = LogisticRegression(solver='liblinear', class_weight="balanced")
175175
elif args.model_choice == 'Logistic-lbfgs':
176-
model = LogisticRegression(solver='lbfgs')
176+
model = LogisticRegression(solver='lbfgs', class_weight="balanced")
177177

178178
if args.standardize_scaling:
179179
# build pipe: first standardize by subtracting mean and dividing std
@@ -200,8 +200,11 @@ def is_valid_file(file_name):
200200
# with open(args.out_predict_labels, 'w') as outlabels:
201201
# outlabels.write('predicted_class,prob_{},prob_{}\n'.format(model.classes_[0],model.classes_[1]))
202202
# y_predict.tofile(outlabels,sep="\n")
203-
pd.DataFrame(stacked_arr, index=df_features_predict.index).to_csv(args.out_predict_labels, index=True,
204-
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1])],float_format='%.2F')
203+
df_out_predict = pd.DataFrame(stacked_arr, index=df_features_predict.index)
204+
df_out_predict.sort_values(by=[df_out_predict.columns[1],df_out_predict.columns[2]], ascending=False, inplace=True)
205+
df_out_predict['rank'] = df_out_predict.iloc[:,2].rank(method="first",ascending=False)
206+
df_out_predict.to_csv(args.out_predict_labels, index=True,
207+
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1]),'rank'],float_format='%.3F')
205208

206209
if args.save_model is True:
207210
with open(args.out_model, 'wb') as outf:

src/generate_kmer_features.py

Lines changed: 21 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,13 @@
77
import re
88
import argparse
99
import os.path
10-
from multiprocessing import Pool
11-
from tempfile import TemporaryDirectory
12-
import pickle
13-
from typing import List
1410
import sys
1511
BINDIR = os.path.dirname(os.path.realpath(__file__))
1612
def find_kmer_hits(sequence, kmer):
1713
return [m.start() for m in re.finditer('(?='+kmer+')', sequence)] # re with look-ahead for overlaps
1814

1915
def call_command (cmd):
20-
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE, stderr=PIPE)
16+
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE)
2117
(result, error) = p.communicate()
2218
if error:
2319
raise RuntimeError("Error in calling cmd or perl script\ncmd:{}\nstdout:{}\nstderr:{}".format(cmd, result, error))
@@ -67,50 +63,6 @@ def is_valid_file(file_name):
6763
return os.path.abspath(file_name)
6864
else:
6965
raise FileNotFoundError(os.path.abspath(file_name))
70-
71-
def multicore_wrapper(seq_record, args):
72-
out_csv_str = seq_record.id
73-
print(seq_record.id)
74-
75-
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(seq_record.seq),
76-
minE_subopt=args.minE_subopt,
77-
minE_intarna=args.minE_intarna)
78-
for kmer in kmers_list:
79-
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(seq_record.seq),
80-
seq_subopt,
81-
seq_intarna,
82-
kmer)
83-
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(
84-
hsubopt), len(hintarna), len(hsubopt_intarna)
85-
array_features = []
86-
if "a" in args.feature_context.lower():
87-
array_features.append(cseq)
88-
if "s" in args.feature_context.lower():
89-
array_features.append(csubopt)
90-
if "h" in args.feature_context.lower():
91-
array_features.append(cintarna)
92-
if "u" in args.feature_context.lower():
93-
array_features.append(csubopt_intarna)
94-
95-
if args.report_counts is True:
96-
out_csv_str += ''.join([',{}'.format(f) for f in array_features])
97-
else:
98-
binary_hits = ['0' if c == 0 else '1' for c in array_features]
99-
out_csv_str += "," + ','.join(binary_hits)
100-
return out_csv_str
101-
102-
103-
def write_pickled_output(files: List[str], outfile: str, csv_header: str):
104-
with open(outfile, "w") as of:
105-
of.write(csv_header)
106-
for file in files:
107-
with open(file, "rb") as handle:
108-
data = pickle.load(handle)
109-
of.write("\n".join(data) + "\n")
110-
del data
111-
112-
113-
11466

11567
if __name__ == '__main__':
11668

@@ -120,99 +72,35 @@ def write_pickled_output(files: List[str], outfile: str, csv_header: str):
12072

12173
parser.add_argument('--kmers', required=True, type=str, help='List of kmers as a comma separated string e.g. \"AGG,GA,GG\"')
12274
parser.add_argument('--fasta', required=True, type=is_valid_file, help='Sequences to extract features from as a FASTA file')
123-
parser.add_argument('--threads', type=int, default=1, help='Number of threads used for processing (default: 1) (WARNING: threads > 1 will impair stdout prints')
124-
parser.add_argument('--batchsize', type=int, default=10000, help='If the number of processed fasta sequences is greater than batch size batch processing will be applied. This will lower memory consumption (default: 10000)')
12575
parser.add_argument('--report-counts', action='store_true', help='Whether to report counts as integer, default is binary nohit(0)-hit(1)'),
12676
parser.add_argument('--out-csv', type=str, default='stdout', help='CSV File name to write counts, pass "stdout" for stdout ')
12777
parser.add_argument('--minE-subopt', default=-3, type=int, help='Minimum free energy of the position on RNAsubopt result')
12878
parser.add_argument('--minE-intarna', default=-3, type=int, help='Minimum free energy of the position on IntaRNA result')
129-
parser.add_argument('--feature-context', default="ashu", type=str, help='feature groups (contexts) are to be generated by case-insensitive single letter'
130-
+'\n\ta - any context (just k-mer occurrence)'
131-
+'\n\ts - unpaired in stable intra-molecular structures'
132-
+'\n\th - unpaired in stable inter-molecular homo-duplex RRIs'
133-
+'\n\tu - unpaired in both in (s) and (h)')
134-
79+
13580
args = parser.parse_args()
13681
print(args)
13782
out_csv_str = "id"
13883
kmers_list = args.kmers.split(',')
13984
for kmer in kmers_list:
140-
if "a" in args.feature_context.lower():
141-
out_csv_str += ",{}_any".format(kmer)
142-
if "s" in args.feature_context.lower():
143-
out_csv_str += ",{}_intra".format(kmer)
144-
if "h" in args.feature_context.lower():
145-
out_csv_str += ",{}_dimer".format(kmer)
146-
if "u" in args.feature_context.lower():
147-
out_csv_str += ",{}_free".format(kmer)
148-
85+
out_csv_str += ",{}_any,{}_intra,{}_dimer,{}_free".format(kmer,kmer,kmer,kmer)
14986
out_csv_str += '\n'
150-
if args.threads == 1:
151-
for r in SeqIO.parse(args.fasta, format='fasta'):
152-
print(r.id)
153-
out_csv_str += r.id
154-
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
155-
for kmer in kmers_list:
156-
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
157-
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
158-
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
159-
array_features = []
160-
if "a" in args.feature_context.lower():
161-
array_features.append(cseq)
162-
if "s" in args.feature_context.lower():
163-
array_features.append(csubopt)
164-
if "h" in args.feature_context.lower():
165-
array_features.append(cintarna)
166-
if "u" in args.feature_context.lower():
167-
array_features.append(csubopt_intarna)
168-
169-
if args.report_counts is True:
170-
out_csv_str += ''.join([',{}'.format(f) for f in array_features])
171-
else:
172-
binary_hits = ['0' if c==0 else '1' for c in array_features]
173-
out_csv_str += ","+','.join(binary_hits)
174-
out_csv_str += '\n'
175-
176-
if args.out_csv == "stdout":
177-
print(out_csv_str)
178-
else:
179-
with open(args.out_csv, 'w') as outfile:
180-
outfile.write(out_csv_str)
181-
else:
182-
183-
calls = []
184-
for seq_record in SeqIO.parse(args.fasta, format='fasta'):
185-
calls.append((seq_record, args))
186-
187-
if args.batchsize < len(calls):
188-
tmp_dir = TemporaryDirectory(prefix="BrainDead")
189-
files = []
190-
batch_calls = [calls[x:x+args.batchsize] for x in range(0, len(calls), args.batchsize)]
191-
for x, batch in enumerate(batch_calls):
192-
with Pool(processes=args.threads) as pool:
193-
outstrings = pool.starmap(multicore_wrapper, batch)
194-
file = os.path.join(tmp_dir.name, f"batch_{x}.pckl")
195-
files.append(file)
196-
with open(file, "wb") as handle:
197-
pickle.dump(outstrings, handle)
198-
write_pickled_output(files=files,
199-
outfile=args.out_csv,
200-
csv_header=out_csv_str)
201-
202-
203-
else:
204-
with Pool(processes=args.threads) as pool:
205-
outstrings = pool.starmap(multicore_wrapper, calls)
206-
207-
out_csv_str += "\n".join(outstrings) + "\n"
208-
209-
if args.out_csv == "stdout":
210-
print(out_csv_str)
87+
for r in SeqIO.parse(args.fasta, format='fasta'):
88+
print(r.id)
89+
out_csv_str += r.id
90+
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
91+
for kmer in kmers_list:
92+
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
93+
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
94+
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
95+
if args.report_counts is True:
96+
out_csv_str += ',{},{},{},{}'.format(cseq, csubopt, cintarna, csubopt_intarna)
21197
else:
212-
with open(args.out_csv, 'w') as outfile:
213-
outfile.write(out_csv_str)
214-
215-
216-
217-
98+
binary_hits = ['0' if c==0 else '1' for c in [cseq, csubopt, cintarna, csubopt_intarna]]
99+
out_csv_str += ","+','.join(binary_hits)
100+
out_csv_str += '\n'
218101

102+
if args.out_csv == "stdout":
103+
print(out_csv_str)
104+
else:
105+
with open(args.out_csv, 'w') as outfile:
106+
outfile.write(out_csv_str)

0 commit comments

Comments (0)