BioGeMT · stephaniesamm · Feb 7, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/code/clustering/gene_fasta.py b/code/clustering/gene_fasta.py
@@ -9,8 +9,6 @@ def convert_tsv_to_fasta(input_file, output_file):
         for index, row in data.iterrows():
             sequence = row['gene']
             fasta_file.write(f">Seq_{index + 1}\n{sequence}\n")
-
-    print(f"FASTA file created: {output_file}")
 
 def main():
 

diff --git a/code/clustering/map_gene_clusters.py b/code/clustering/map_gene_clusters.py
@@ -9,7 +9,6 @@ def main(cluster_csv, dataset_tsv, output_tsv):
     gene_df["gene_cluster_ID"] = clusters_df["Cluster_ID"]
 
     gene_df.to_csv(output_tsv, sep="\t", index=False)
-    print(f"Results saved to {output_tsv}")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Map cluster IDs to sequences and merge with gene data.")

diff --git a/code/filter_interactions/add_seed_types.py b/code/filter_interactions/add_seed_types.py
@@ -0,0 +1,32 @@
+from miRBench.encoder import get_encoder
+from miRBench.predictor import get_predictor
+import pandas as pd
+import argparse
+
+def add_seeds(df):
+    """Add one miRBench prediction column per seed type to `df` (in place) and return `df`."""
+    for seed_name in ("Seed6mer", "Seed6merBulgeOrMismatch"):
+        # The encoder and the predictor for a seed type are both looked up by its name.
+        encode = get_encoder(seed_name)
+        predict = get_predictor(seed_name)
+        # Encode the frame as it currently stands and store the predictions under that name.
+        df[seed_name] = predict(encode(df))
+    return df
+
+def main():
+    """Parse CLI arguments, annotate the input TSV with seed types and write the result as TSV."""
+    parser = argparse.ArgumentParser(description="Add seed types via miRBench")
+    # Both paths are mandatory: an omitted --ofile would reach DataFrame.to_csv as None, which
+    # returns the text instead of writing a file, so the annotated table would be silently lost.
+    parser.add_argument("--ifile", type=str, required=True, help="Input file")
+    parser.add_argument("--ofile", type=str, required=True, help="Output file with seed types")
+    args = parser.parse_args()
+
+    # Input and output are both tab-separated; the DataFrame index is not written out.
+    df = pd.read_csv(args.ifile, sep='\t')
+    df_seedtypes = add_seeds(df)
+    df_seedtypes.to_csv(args.ofile, sep='\t', index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code/filter_interactions/add_seed_types_README.md b/code/filter_interactions/add_seed_types_README.md
@@ -0,0 +1,32 @@
+# Seed Type Annotation
+
+## Overview
+
+This script annotates seed types using the miRBench package.
+
+## Features
+
+- Indicates the presence of Seed6mer and Seed6merBulgeOrMismatch per noncodingRNA:gene pair, using the **miRBench** seed encoders and predictors.
+
+## Requirements
+
+- Python 3.8
+- Required Python packages:
+  - `miRBench`
+  - `pandas`
+  - `argparse`
+
+## Usage
+
+Run the script with the following command:
+
+```bash
+python add_seed_types.py --ifile <input_file> --ofile <output_file>
+```
+
+### Arguments
+
+- `--ifile` (required): Path to input file with `noncodingRNA` and `gene` columns
+- `--ofile` (required): Path to output file with added seed types (columns: `Seed6mer`, `Seed6merBulgeOrMismatch`)
+
+
diff --git a/code/filter_interactions/filter_interactions.py b/code/filter_interactions/filter_interactions.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import argparse
+
+def filter_interactions(df):
+    """Split `df` into (canonical, non-canonical, no-seed) frames, without the two seed columns.
+
+    Note: in the miRBench package, Seed6merBulgeOrMismatch is inclusive of Seed6mer.
+    """
+    has_6mer = df['Seed6mer']
+    has_loose = df['Seed6merBulgeOrMismatch']
+    masks = (
+        has_6mer == 1,                        # canonical seed: Seed6mer is 1
+        (has_loose == 1) & (has_6mer == 0),   # non-canonical seed: loose seed type only
+        has_loose == 0,                       # no seed: Seed6merBulgeOrMismatch is 0
+    )
+    seed_columns = ['Seed6mer', 'Seed6merBulgeOrMismatch']
+    df_canonical, df_noncanonical, df_noseed = (df[m].drop(columns=seed_columns) for m in masks)
+    return df_canonical, df_noncanonical, df_noseed
+
+def write_interactions(df, ofile):
+    df.to_csv(path_or_buf=ofile, index=False, sep='\t')  # tab-separated, index column omitted
+
+def main():
+    """Parse CLI arguments, split the seed-annotated TSV by interaction type, write three TSVs."""
+    parser = argparse.ArgumentParser(description="Filter canonical/non-canonical/no-seed interactions, for all Manakov datasets")
+    # Every path is mandatory: an omitted output path would reach DataFrame.to_csv as None,
+    # which returns the text instead of writing a file, silently discarding that subset.
+    parser.add_argument("--ifile", type=str, required=True, help="Input file with seed types")
+    parser.add_argument("--canonical_ofile", type=str, required=True, help="Output file for canonical seed types")
+    parser.add_argument("--noncanonical_ofile", type=str, required=True, help="Output file for noncanonical seed types")
+    parser.add_argument("--nonseed_ofile", type=str, required=True, help="Output file for nonseed types")
+    args = parser.parse_args()
+
+    # Read the file with seed types and split it into the three interaction classes
+    df_seed_types = pd.read_csv(args.ifile, sep='\t')
+    df_canonical, df_noncanonical, df_noseed = filter_interactions(df_seed_types)
+    # Write each interaction class to its own file
+    write_interactions(df_canonical, args.canonical_ofile)
+    write_interactions(df_noncanonical, args.noncanonical_ofile)
+    write_interactions(df_noseed, args.nonseed_ofile)
+
+if __name__ == "__main__":
+    main()
diff --git a/code/filter_interactions/filter_interactions_README.md b/code/filter_interactions/filter_interactions_README.md
@@ -0,0 +1,37 @@
+# Filter interactions
+
+## Overview
+
+This script filters a file with annotated seed types into canonical, non-canonical, and non-seed interactions.
+
+## Features
+
+- Defines canonical, non-canonical, and non-seed interactions.
+- Filters canonical, non-canonical, and non-seed interactions, and saves them into 3 distinct files.
+
+## Requirements
+
+- Python 3.8
+- Required Python packages:
+  - `pandas`
+  - `argparse`
+
+## Usage
+
+Run the script with the following command:
+
+```bash
+python filter_interactions.py --ifile <input_file_with_seed_types> --canonical_ofile <output_file_with_canonical_interactions> --noncanonical_ofile <output_file_with_noncanonical_interactions> --nonseed_ofile <output_file_with_nonseed_interactions>
+```
+
+### Arguments
+
+- `--ifile`: Path to input file with seed type annotations (columns: `Seed6mer`, `Seed6merBulgeOrMismatch`). This is the output of the `add_seed_types.py` script.
+- `--canonical_ofile`: Path to output file containing canonical (Seed6mer) interactions
+- `--noncanonical_ofile`: Path to output file containing non-canonical (Seed6merBulgeOrMismatch but not Seed6mer) interactions. 
+- `--nonseed_ofile`: Path to output file containing non-seed (No Seed6merBulgeOrMismatch) interactions.  
+
+## Note
+
+Note that in the miRBench package, Seed6merBulgeOrMismatch is the loosest seed type and therefore includes all other seed types defined in the package, including Seed6mer.
+
diff --git a/code/make_neg_sets/README.md b/code/make_neg_sets/README.md
@@ -14,6 +14,8 @@ This means that the negatives produced for a miRNA family will have the same miR
 
 Additionally the gene targets from a single cluster are only picked once per miRNA family, ensuring that the miRNA-gene pairs that have the same miRNA family will have gene targets that are not similar to eachother (are not in the same cluster).
 
+Furthermore, the selected negatives are filtered for the interaction type relevant to the given set of positives (canonical / noncanonical / nonseed). As a result, positives in the input file may be excluded or downsampled if not enough valid negative candidates are found.
+
 This script can only produce 1:1 positive to negative class ratio. 
 
 The script processes input data block-wise to optimize memory usage and efficiently handle large datasets.
@@ -33,13 +35,14 @@ The script processes input data block-wise to optimize memory usage and efficien
   - `argparse`
   - `pandas`
   - `time`
+  - `miRBench`
 
 ## Usage
 
 ### Command
 
 ```
-python make_neg_sets.py --ifile <input_file> --ofile <output_file>
+python make_neg_sets.py --ifile <input_file> --ofile <output_file> --interaction_type <type_of_interaction>
 ```
 
 ### Arguments
@@ -50,16 +53,12 @@ python make_neg_sets.py --ifile <input_file> --ofile <output_file>
 - `--ofile` (required):  
   Path to the output file where both positive and negative examples will be saved.
 
-## Input file format
+- `--interaction_type` (required): 
+  Interaction type for which the input file has previously been filtered. One of: 'nonseed', 'canonicalseed', or 'noncanonicalseed'.
 
-The input file must:
+## Input file format
 
-1. Be a tab-separated file (TSV) that is sorted by `noncodingRNA_fam` to ensure correct block-wise processing. 
-2. Contain the following columns (including but not limited to):
-    - noncodingRNA_fam: miRNA family identifier.
-    - gene_cluster_ID: Cluster ID for genes.
-    - noncodingRNA: miRNA sequence.
-    - Additional columns such as gene, feature, chr, etc.
+The input file must be a tab-separated file (TSV) that is sorted by `noncodingRNA_fam` to ensure correct block-wise processing.
 
 ## Output file format
 
@@ -73,4 +72,4 @@ The output file will:
 
 ## Notes
 
-Negative examples are generated only when there are sufficient candidates to choose from. An error is raised otherwise. 
+Negative examples are generated only when there are sufficient valid candidates to choose from. A message is printed if positives have been downsampled or excluded from the output file.