diff --git a/code/clustering/gene_fasta.py b/code/clustering/gene_fasta.py index 60f8405..04a8f7e 100644 --- a/code/clustering/gene_fasta.py +++ b/code/clustering/gene_fasta.py @@ -9,8 +9,6 @@ def convert_tsv_to_fasta(input_file, output_file): for index, row in data.iterrows(): sequence = row['gene'] fasta_file.write(f">Seq_{index + 1}\n{sequence}\n") - - print(f"FASTA file created: {output_file}") def main(): diff --git a/code/clustering/map_gene_clusters.py b/code/clustering/map_gene_clusters.py index f31689d..f327eaa 100644 --- a/code/clustering/map_gene_clusters.py +++ b/code/clustering/map_gene_clusters.py @@ -9,7 +9,6 @@ def main(cluster_csv, dataset_tsv, output_tsv): gene_df["gene_cluster_ID"] = clusters_df["Cluster_ID"] gene_df.to_csv(output_tsv, sep="\t", index=False) - print(f"Results saved to {output_tsv}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Map cluster IDs to sequences and merge with gene data.") diff --git a/code/filter_interactions/add_seed_types.py b/code/filter_interactions/add_seed_types.py new file mode 100644 index 0000000..e0a96e6 --- /dev/null +++ b/code/filter_interactions/add_seed_types.py @@ -0,0 +1,32 @@ +from miRBench.encoder import get_encoder +from miRBench.predictor import get_predictor +import pandas as pd +import argparse + +def add_seeds(df): + seed_types = ["Seed6mer", "Seed6merBulgeOrMismatch"] + for tool in seed_types: + encoder = get_encoder(tool) + predictor = get_predictor(tool) + encoded_input = encoder(df) + output = predictor(encoded_input) + df[tool] = output + return df + +def main(): + parser = argparse.ArgumentParser(description="Add seed types via miRBench") + parser.add_argument("--ifile", type=str, help="Input file") + parser.add_argument("--ofile", type=str, help="Output file with seed types") + args = parser.parse_args() + + # Read input file + df = pd.read_csv(args.ifile, sep='\t') + + # Add seed types + df_seedtypes = add_seeds(df) + + # Write seed types to file + 
df_seedtypes.to_csv(args.ofile, sep='\t', index=False) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/filter_interactions/add_seed_types_README.md b/code/filter_interactions/add_seed_types_README.md new file mode 100644 index 0000000..b51efd3 --- /dev/null +++ b/code/filter_interactions/add_seed_types_README.md @@ -0,0 +1,32 @@ +# Seed Type Annotation + +## Overview + +This script annotates seed types using the miRBench package. + +## Features + +- Indicates the presence of Seed6mer and Seed6merBulgeOrMismatch per noncodingRNA:gene pair, using the **miRBench** seed encoders and predictors. + +## Requirements + +- Python 3.8 +- Required Python packages: + - `miRBench` + - `pandas` + - `argparse` + +## Usage + +Run the script with the following command: + +```bash +python add_seed_types.py --ifile --ofile +``` + +### Arguments + +- `--ifile` (required): Path to input file with `noncodingRNA` and `gene` columns +- `--ofile` (required): Path to output file with added seed types (columns: `Seed6mer`, `Seed6merBulgeOrMismatch`) + + diff --git a/code/filter_interactions/filter_interactions.py b/code/filter_interactions/filter_interactions.py new file mode 100644 index 0000000..8f976ce --- /dev/null +++ b/code/filter_interactions/filter_interactions.py @@ -0,0 +1,43 @@ +import pandas as pd +import argparse + +def filter_interactions(df): + # Canonical seed: Seed6mer is 1 + df_canonical = df[df['Seed6mer'] == 1].copy() + df_canonical = df_canonical.drop(columns=['Seed6mer', 'Seed6merBulgeOrMismatch']) + + # Note that in miRBench package, Seed6merBulgeOrMismatch is inclusive of Seed6mer + # Non-canonical seed: Seed6merBulgeOrMismatch is 1 AND Seed6mer is 0 + df_noncanonical = df.loc[(df["Seed6merBulgeOrMismatch"] == 1) & (df["Seed6mer"] == 0)].copy() + df_noncanonical = df_noncanonical.drop(columns=['Seed6mer', 'Seed6merBulgeOrMismatch']) + + # No seed: Seed6merBulgeOrMismatch is 0 + df_noseed = df[df['Seed6merBulgeOrMismatch'] == 
0].copy() + df_noseed = df_noseed.drop(columns=['Seed6mer', 'Seed6merBulgeOrMismatch']) + + return df_canonical, df_noncanonical, df_noseed + +def write_interactions(df, ofile): + df.to_csv(ofile, sep='\t', index=False) + +def main(): + parser = argparse.ArgumentParser(description="Filter canonical/non-canonical/no-seed interactions, for all Manakov datasets") + parser.add_argument("--ifile", type=str, help="Input file with seed types") + parser.add_argument("--canonical_ofile", type=str, help="Output file for canonical seed types") + parser.add_argument("--noncanonical_ofile", type=str, help="Output file for noncanonical seed types") + parser.add_argument("--nonseed_ofile", type=str, help="Output file for nonseed types") + args = parser.parse_args() + + # Read file with seed types + df_seed_types = pd.read_csv(args.ifile, sep='\t') + + # Filter canonical/non-canonical/non-seed interactions + df_canonical, df_noncanonical, df_noseed = filter_interactions(df_seed_types) + + # Write interactions to file + write_interactions(df_canonical, args.canonical_ofile) + write_interactions(df_noncanonical, args.noncanonical_ofile) + write_interactions(df_noseed, args.nonseed_ofile) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/filter_interactions/filter_interactions_README.md b/code/filter_interactions/filter_interactions_README.md new file mode 100644 index 0000000..05afe0b --- /dev/null +++ b/code/filter_interactions/filter_interactions_README.md @@ -0,0 +1,37 @@ +# Filter interactions + +## Overview + +This script filters a file with annotated seed types into canonical, non-canonical, and non-seed interactions. + +## Features + +- Defines canonical, non-canonical, and non-seed interactions. +- Filters canonical, non-canonical, and non-seed interactions, and saves them into 3 distinct files. 
+ +## Requirements + +- Python 3.8 +- Required Python packages: + - `pandas` + - `argparse` + +## Usage + +Run the script with the following command: + +```bash +python filter_interactions.py --ifile --canonical_ofile --noncanonical_ofile --nonseed_ofile +``` + +### Arguments + +- `--ifile`: Path to input file with seed type annotations (columns: Seed6mer, Seed6merBulgeOrMismatch). This is the output of the add_seed_types.py script. +- `--canonical_ofile`: Path to output file containing canonical (Seed6mer) interactions. +- `--noncanonical_ofile`: Path to output file containing non-canonical (Seed6merBulgeOrMismatch but not Seed6mer) interactions. +- `--nonseed_ofile`: Path to output file containing non-seed (No Seed6merBulgeOrMismatch) interactions. + +## Note + +Note that in the miRBench package, Seed6merBulgeOrMismatch is the loosest seed type and therefore includes all other seed types defined in the miRBench package, including the Seed6mer. + diff --git a/code/make_neg_sets/README.md b/code/make_neg_sets/README.md index b3ec1eb..2c659bb 100644 --- a/code/make_neg_sets/README.md +++ b/code/make_neg_sets/README.md @@ -14,6 +14,8 @@ This means that the negatives produced for a miRNA family will have the same miR Additionally the gene targets from a single cluster are only picked once per miRNA family, ensuring that the miRNA-gene pairs that have the same miRNA family will have gene targets that are not similar to eachother (are not in the same cluster). +Furthermore, the selected negatives are also filtered for the interaction type relevant to a particular set of positives (canonical / noncanonical / nonseed). This may exclude or downsample positives in the input file if not enough valid negative candidates are found. + This script can only produce 1:1 positive to negative class ratio. The script processes input data block-wise to optimize memory usage and efficiently handle large datasets. 
@@ -33,13 +35,14 @@ The script processes input data block-wise to optimize memory usage and efficien - `argparse` - `pandas` - `time` + - `miRBench` ## Usage ### Command ``` -python make_neg_sets.py --ifile --ofile +python make_neg_sets.py --ifile --ofile --interaction_type ``` ### Arguments @@ -50,16 +53,12 @@ python make_neg_sets.py --ifile --ofile - `--ofile` (required): Path to the output file where both positive and negative examples will be saved. -## Input file format +- `--interaction_type` (required): + Interaction type for which the input file has previously been filtered. One of: 'nonseed', 'canonicalseed', or 'noncanonicalseed'. -The input file must: +## Input file format -1. Be a tab-separated file (TSV) that is sorted by `noncodingRNA_fam` to ensure correct block-wise processing. -2. Contain the following columns (including but not limited to): - - noncodingRNA_fam: miRNA family identifier. - - gene_cluster_ID: Cluster ID for genes. - - noncodingRNA: miRNA sequence. - - Additional columns such as gene, feature, chr, etc. +The input file must be a tab-separated file (TSV) that is sorted by `noncodingRNA_fam` to ensure correct block-wise processing. ## Output file format @@ -73,4 +72,4 @@ The output file will: ## Notes -Negative examples are generated only when there are sufficient candidates to choose from. An error is raised otherwise. +Negative examples are generated only when there are sufficient valid candidates to choose from. A message is printed if positives have been downsampled or excluded from the output file. 
diff --git a/code/make_neg_sets/make_neg_sets.py b/code/make_neg_sets/make_neg_sets.py index 9cf22dc..5eb031a 100644 --- a/code/make_neg_sets/make_neg_sets.py +++ b/code/make_neg_sets/make_neg_sets.py @@ -2,6 +2,8 @@ import pandas as pd import time import hashlib +from miRBench.encoder import get_encoder +from miRBench.predictor import get_predictor # Yield blocks of positive examples with the same mirnafam to process at a time def yield_mirnafam_blocks(positive_file_path): @@ -29,20 +31,41 @@ def yield_mirnafam_blocks(positive_file_path): block_df = pd.DataFrame(current_block, columns=header_columns) yield block_df -def process_block(block, positive_samples, all_clusters, output_file): +# Get seed types for the given dataframe +def get_seeds(df): + seed_types = ["Seed6mer", "Seed6merBulgeOrMismatch"] + for tool in seed_types: + encoder = get_encoder(tool) + predictor = get_predictor(tool) + encoded_input = encoder(df) + output = predictor(encoded_input) + df[tool] = output + return df + +def make_reproducibility_seed(string): + + # Generate a SHA-256 hash and get the hexadecimal string + miRNA_hash_hex = hashlib.sha256(string.encode()).hexdigest() + # Convert the hexadecimal hash to a decimal integer + miRNA_hash_int = int(miRNA_hash_hex, 16) + # Reduce the size using modulo (e.g., within the range of a 32-bit unsigned integer) and set it as the seed + seed = miRNA_hash_int % 4294967295 - # Set a fixed seed for reproducibility - ## Get the first item from 'noncodingRNA_name' - miRNA_name = block['noncodingRNA_name'].iloc[0] + return seed - ## Generate a SHA-256 hash and get the hexadecimal string - miRNA_hash_hex = hashlib.sha256(miRNA_name.encode()).hexdigest() +def process_valid_negatives(valid_negatives, block_columns): - ## Convert the hexadecimal hash to a decimal integer - miRNA_hash_int = int(miRNA_hash_hex, 16) + valid_negatives = valid_negatives.drop(columns=['Seed6mer', 'Seed6merBulgeOrMismatch']) + valid_negatives['label'] = 0 + valid_negatives = 
valid_negatives[block_columns] - ## Reduce the size using modulo (e.g., within the range of a 32-bit unsigned integer) and set it as the seed - seed = miRNA_hash_int % 4294967295 + return valid_negatives + +def process_block(block, positive_samples, all_clusters, output_file, interaction_type): + + # Generate seed for reproducibility + miRNA_name = block['noncodingRNA_name'].iloc[0] + seed = make_reproducibility_seed(miRNA_name) # Get the set of cluster ids that share this miRNA family block_clusters = block['gene_cluster_ID'].unique().tolist() @@ -51,48 +74,104 @@ def process_block(block, positive_samples, all_clusters, output_file): mirfam_allowed_clusters = [cluster for cluster in all_clusters if cluster not in block_clusters] # Pool gene rows from allowed clusters - negative_pool = positive_samples[positive_samples['gene_cluster_ID'].isin(mirfam_allowed_clusters)] + negative_gene_pool = positive_samples[positive_samples['gene_cluster_ID'].isin(mirfam_allowed_clusters)] - # Shuffle the negative pool and drop duplicates based on ClusterID - negative_pool = negative_pool.sample(frac=1, random_state=seed).drop_duplicates(subset=['gene_cluster_ID'], keep='first') + # Shuffle the negative gene pool and drop duplicates based on ClusterID + negative_gene_pool = negative_gene_pool.sample(frac=1, random_state=seed).drop_duplicates(subset=['gene_cluster_ID'], keep='first') + + # Make list of unique noncodingRNA values in block + unique_mirnas = block['noncodingRNA'].unique().tolist() - # Get the number of negatives to be generated for this miRNA family block - num_neg = block.shape[0] + # Compute occurrences of each unique miRNA in the block + mirna_counts = block['noncodingRNA'].value_counts() + + # Define columns + gene_columns = ['gene', 'feature', 'test', 'chr', 'start', 'end', 'strand', 'gene_cluster_ID'] + mirna_columns = ['noncodingRNA', 'noncodingRNA_name', 'noncodingRNA_fam'] + seed_columns = ['Seed6mer', 'Seed6merBulgeOrMismatch'] - if num_neg > 
len(negative_pool): - raise ValueError(f"Warning: Not enough negative examples for current block. miRNA family: {block['noncodingRNA_fam'].iloc[0]}, first miRNA sequence: {block['noncodingRNA'].iloc[0]}") + for mirna in unique_mirnas: + # Get frequency of the current miRNA + mirna_frequency = mirna_counts[mirna] - # Sample num_neg from mirfam_allowed_genes rows - negative_genes = negative_pool.sample(n=num_neg, random_state=seed) + # Initialize valid_negatives dataframe + valid_negatives = pd.DataFrame(columns=gene_columns + mirna_columns + seed_columns) - # Start constructing the df rows for the negative examples - columns = ['gene', 'feature', 'test', 'chr', 'start', 'end', 'strand', 'gene_cluster_ID'] - negatives_df = negative_genes[columns].copy() + # Increment seed for each miRNA + seed += 1 - # Add the miRNA sequence, name and family columns from block to negatives_df by index - negatives_df['noncodingRNA'] = block['noncodingRNA'].values - negatives_df['noncodingRNA_name'] = block['noncodingRNA_name'].values - negatives_df['noncodingRNA_fam'] = block['noncodingRNA_fam'].values + # Shuffle the negative gene pool with incrementing seed for each miRNA + negative_gene_pool = negative_gene_pool.sample(frac=1, random_state=seed) - # Add the label column to negatives_df - negatives_df['label'] = 0 + # Iterate over each row of the negative gene pool + for index, row in negative_gene_pool.iterrows(): - # Reorder columns in negatives_df to match the order in block - negatives_df = negatives_df[block.columns] + # Get gene columns from the row + negative_candidate = pd.DataFrame([row])[gene_columns].copy() - # Append positive examples for this block to the output file - block.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + # Add miRNA columns to the negative candidate + negative_candidate['noncodingRNA'] = mirna + negative_candidate['noncodingRNA_name'] = block[block['noncodingRNA'] == mirna]['noncodingRNA_name'].iloc[0] # Assumes that the name is 
the same for all occurrences of the miRNA + negative_candidate['noncodingRNA_fam'] = block[block['noncodingRNA'] == mirna]['noncodingRNA_fam'].iloc[0] # Assumes that the family is the same for all occurrences of the miRNA + + # Compute seeds for the negative candidate + negative_candidate = get_seeds(negative_candidate) + + # Filter the negative candidates based on interaction type + if interaction_type == 'nonseed': + negative_candidate = negative_candidate[negative_candidate['Seed6merBulgeOrMismatch'] == 0] + elif interaction_type == 'canonicalseed': + negative_candidate = negative_candidate[negative_candidate['Seed6mer'] == 1] + elif interaction_type == 'noncanonicalseed': + negative_candidate = negative_candidate[(negative_candidate['Seed6mer'] == 0) & (negative_candidate['Seed6merBulgeOrMismatch'] == 1)] + + # If negative candidate is empty + if negative_candidate.empty: + continue + # If negative candidate contains something + else: + # Append the negative candidate to the valid negatives df + valid_negatives = pd.concat([valid_negatives, negative_candidate], ignore_index=True) + + # If there are enough valid negatives (as many as the frequency of the miRNA in the miRNA family block) + if len(valid_negatives) >= mirna_frequency: - # Append negative examples for this block to the output file - negatives_df.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + # Get block rows for which column noncodingRNA == mirna and save to file (positives) + block_mirna = block[block['noncodingRNA'] == mirna].copy() + block_mirna.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + + # Slice the valid negatives df to the required frequency, process valid negatives, and save to file (negatives) + valid_negatives = valid_negatives.iloc[:mirna_frequency].copy() + valid_negatives = process_valid_negatives(valid_negatives, block.columns) + valid_negatives.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + # Exit the loop to move 
on to the next unique miRNA in the block + break + # If there are not enough valid negatives + else: + # Check if the last row of the shuffled negative gene pool is reached (iterrows yields index labels, not positions) + if index == negative_gene_pool.index[-1]: + if not valid_negatives.empty: + block_mirna = block[block['noncodingRNA'] == mirna].iloc[:len(valid_negatives)].copy() + block_mirna.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + + valid_negatives = process_valid_negatives(valid_negatives, block.columns) + valid_negatives.to_csv(output_file, sep='\t', index=False, header=False, mode='a') + + # Print a message if the end of the negative gene pool is reached and there are still not enough valid negatives + print(f"Missing {mirna_frequency - len(valid_negatives)} negatives for miRNA: {mirna}. Positive examples downsampled to retain 1:1 class ratio.", flush=True) + else: + # Print a message if the end of the negative gene pool is reached and there are no valid negatives + print(f"No valid negatives for miRNA: {mirna}. 
Excluding it from positive and negative examples.", flush=True) def main(): # Record start time start = time.time() + # Parse command line arguments parser = argparse.ArgumentParser(description="Generate negative examples.") parser.add_argument('--ifile', type=str, required=True, help="Input file name, MUST BE SORTED by 'miRNA family!'") parser.add_argument('--ofile', type=str, required=True, help="Output file name") + parser.add_argument('--interaction_type', choices=['nonseed', 'canonicalseed', 'noncanonicalseed'], required=True, help="Interactions type to use for generating negative examples") args = parser.parse_args() # Read the entire positive examples file @@ -106,6 +185,7 @@ def main(): with open(args.ofile, 'a') as ofile: + # Iterate over blocks of positive examples with the same miRNA family for block in yield_mirnafam_blocks(args.ifile): # Check if the miRNA family is unknown @@ -118,16 +198,16 @@ def main(): for mirna in unique_mirnas: # Get the block of rows for this miRNA sequence - sub_block = block[block['noncodingRNA'] == mirna] + sub_block = block[block['noncodingRNA'] == mirna].copy() # Run rest of code for each sub_block - process_block(sub_block, positive_samples, all_clusters, args.ofile) + process_block(sub_block, positive_samples, all_clusters, args.ofile, args.interaction_type) print(f"Processed miRNA sequence block: {sub_block['noncodingRNA'].iloc[0]}", flush=True) else: - # Process the block normally if not 'unknown' - process_block(block, positive_samples, all_clusters, args.ofile) + # Process the block normally if miRNA family not 'unknown' + process_block(block, positive_samples, all_clusters, args.ofile, args.interaction_type) print(f"Processed miRNA family block: {block['noncodingRNA_fam'].iloc[0]}", flush=True) diff --git a/code/post_process/README.md b/code/post_process/README.md index 255f04d..a228e62 100644 --- a/code/post_process/README.md +++ b/code/post_process/README.md @@ -4,28 +4,30 @@ This series of pipelines is designed 
to process as input, the HybriDetector `*.unified_length_all_types_unique_high_confidence.tsv` output files. -It is intended to be used on the following datasets: -- https://github.com/ML-Bioinfo-CEITEC/HybriDetector/blob/main/ML/Datasets/AGO2_CLASH_Hejret2023_full_dataset.tsv -- https://github.com/ML-Bioinfo-CEITEC/miRBind/blob/main/Datasets/AGO2_eCLIP_Klimentova22_full_dataset.tsv +It is intended to be used on the following dataset: - https://zenodo.org/records/14501607/files/AGO2_eCLIP_Manakov2022_full_dataset.tsv.gz -Note that for the Hejret and Klimentova datasets above, the `miRNA_fam` column must be renamed to `noncodingRNA_fam` after downloading, prior to any processing, for consistency of all column names. +The scope of the pipeline series is to filter the files for miRNA data, deduplicate gene-miRNA sequence pairs, create a left-out test set with miRNA families unique only to this set, annotate seed types and filter canonical, non-canonical, and non-seed interaction types into different files, construct the negative class in an unbiased and class balanced manner, split the datasets into training and testing, and finally add conservation score to the gene sequences. -The scope of the pipeline series is to filter the files for miRNA data, deduplicate gene-miRNA sequence pairs, create a left-out test set with miRNA families unique only to this set, construct the negative class in an unbiased manner, split the datasets into training and testing, and finally add conservation score to the gene sequences. - - -The series is composed of 6 pipelines (listed below) and are intended to be run in the defined order as the output of one feeds the next. Refer to the worflow diagram. +The series is composed of 7 pipelines (listed below) and is intended to be run in the defined order as the output of one feeds the next. Refer to the workflow diagram. 1. postprocess_0_filter_and_deduplicate 2. postprocess_1_exclude_mirna_families -3. postprocess_2_make_negatives -4. 
postprocess_3_train_test_splits -5. postprocess_4_drop_test_col -6. postprocess_4_add_conservation +3. postprocess_1a_add_seedtypes_and_filter_interactions +4. postprocess_2_make_negatives +5. postprocess_3_train_test_splits +6. postprocess_4_drop_test_col +7. postprocess_5_add_conservation ## Requirements - Python 3 -- Run `conda env create --file=post_process.yml`, then `conda activate postprocess` +- Create an environment and make sure the following packages are installed: + - pandas + - miRBench (v1.0.0) + - r-base + - r-biostrings + - r-decipher + - pyBigWig - Manually download the `hg38.phyloP100way.bw` and `hg38.phastCons100way.bw` files from: - https://hgdownload.cse.ucsc.edu/goldenPath/hg38/phyloP100way/ - https://hgdownload.cse.ucsc.edu/goldenpath/hg38/phastCons100way/ @@ -33,6 +35,8 @@ The series is composed of 6 pipelines (listed below) and are intended to be run - `../filtering/filtering.py` - `../excluded_families_testset/unique_family_counter.py` - `../excluded_families_testset/dataset_split_based_on_unique_families.py` + - `../filter_interactions/add_seed_types.py` + - `../filter_interactions/filter_interactions.py` - `../clustering/gene_fasta.py` - `../clustering/clustering.R ` - `../clustering/map_gene_clusters.py` @@ -41,4 +45,4 @@ The series is composed of 6 pipelines (listed below) and are intended to be run ## Usage -Each pipeline of the series must be run separately. Refer to the corresponding README files. +Each pipeline of the series must be run separately. Refer to the corresponding README files. 
\ No newline at end of file diff --git a/code/post_process/post_process.yml b/code/post_process/post_process.yml deleted file mode 100644 index 7321511..0000000 --- a/code/post_process/post_process.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: postprocess -channels: - - bioconda - - conda-forge - - defaults - - hcc -dependencies: - - pandas - - r-base - - r-biostrings - - r-decipher - - pip - - pip: - - pyBigWig - - python-Levenshtein diff --git a/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions.sh b/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions.sh new file mode 100644 index 0000000..57cf4cf --- /dev/null +++ b/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +#SBATCH --account=ssamm10 +#SBATCH --job-name=pp_1a +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=30 + +# parse command-line arguments +while getopts i:o:n: flag; do + case "${flag}" in + i) input_dir=${OPTARG};; + o) output_dir=${OPTARG};; + n) intermediate_dir=${OPTARG};; + esac +done + +# check if required argument is provided +if [ ! -d "$input_dir" ] || [ ! -d "$output_dir" ] || [ ! -d "$intermediate_dir" ]; then + echo "Usage: $0 -i input_dir -o output_dir -n intermediate_dir" + exit 1 +fi + +# define log file in the output directory +log_file="$output_dir/postprocess_1a_add_seedtypes_and_filter_interactions.log" + +# redirect all output to the log file +exec > >(tee -a "$log_file") 2>&1 + +# check for permission issues with log file +if [ $? -ne 0 ]; then + echo "Error setting up log file. Check your permissions." 
+ exit 1 +fi + +# define paths to the directories where the scripts are located +add_seeds_and_filtering_dir="../filter_interactions" + +# define constants for suffixes with extensions +SEED_TYPES_SUFFIX=".seed_types" +CANONICAL_SUFFIX=".canonical6mer" +NON_CANONICAL_SUFFIX=".noncanonical6mer" +NON_SEED_SUFFIX=".nonseed" + +for input_file in "$input_dir"/*.tsv; do + # process the input file + base_name=$(basename "$input_file" .tsv) + seed_types_output="$intermediate_dir/${base_name}${SEED_TYPES_SUFFIX}.tsv" + canonical_output="$output_dir/${base_name}${CANONICAL_SUFFIX}.tsv" + noncanonical_output="$output_dir/${base_name}${NON_CANONICAL_SUFFIX}.tsv" + nonseed_output="$output_dir/${base_name}${NON_SEED_SUFFIX}.tsv" + + # Step 1: Adding seed types to the input file + echo "Adding seed types to $input_file..." + python3 "$add_seeds_and_filtering_dir/add_seed_types.py" --ifile "$input_file" --ofile "$seed_types_output" + if [ $? -ne 0 ]; then + echo "Error in adding seed types. Check your script and input file." + exit 1 + fi + echo "Seed types added to $input_file. Output saved to $seed_types_output" + + # Step 2: Filtering interactions based on seed types + echo "Filtering interactions for $seed_types_output..." + python3 "$add_seeds_and_filtering_dir/filter_interactions.py" --ifile "$seed_types_output" --canonical_ofile "$canonical_output" --noncanonical_ofile "$noncanonical_output" --nonseed_ofile "$nonseed_output" + if [ $? -ne 0 ]; then + echo "Error in filtering interactions. Check your script and input file." + exit 1 + fi + echo "Interactions filtered for $seed_types_output. Canonical interactions saved to $canonical_output, noncanonical interactions saved to $noncanonical_output, and nonseed interactions saved to $nonseed_output" + +done + +echo "Adding seed_types and filtering interactions pipeline completed successfully." 
\ No newline at end of file diff --git a/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions_README.md b/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions_README.md new file mode 100644 index 0000000..52bf45c --- /dev/null +++ b/code/post_process/postprocess_1a_add_seedtypes_and_filter_interactions_README.md @@ -0,0 +1,29 @@ +# postprocess_1a_add_seedtypes_and_filter_interactions_README + +## Usage + +Can be submitted to SLURM workload manager with `sbatch` or: +```bash +postprocess_1a_add_seedtypes_and_filter_interactions.sh -i input_dir -o output_dir -n intermediate_dir +``` +### Parameters + +`-i`: Input directory containing .tsv files +`-o`: Output directory for final files +`-n`: Intermediate directory for intermediate files + +## Pipeline Steps + +1. **Adding seed types**: Refer to ../filter_interactions/add_seed_types_README.md +2. **Filtering interactions**: Refer to ../filter_interactions/filter_interactions_README.md + +## Intermediate Files + +- Input file with added seed types (Seed6mer, Seed6merBulgeOrMismatch): `{input_filename}.seed_types.tsv` + +## Output Files + +- Input file filtered for canonical seed: `{input_filename}.canonical6mer.tsv` +- Input file filtered for non-canonical seed: `{input_filename}.noncanonical6mer.tsv` +- Input file filtered for non-seed interactions: `{input_filename}.nonseed.tsv` +- Log file: `postprocess_1a_add_seedtypes_and_filter_interactions.log` \ No newline at end of file diff --git a/code/post_process/postprocess_2_make_negatives.sh b/code/post_process/postprocess_2_make_negatives.sh index 60de31c..0f70c1a 100644 --- a/code/post_process/postprocess_2_make_negatives.sh +++ b/code/post_process/postprocess_2_make_negatives.sh @@ -6,17 +6,18 @@ #SBATCH --cpus-per-task=30 # parse command-line arguments -while getopts i:o:n: flag; do +while getopts i:o:n:t: flag; do case "${flag}" in i) input_dir=${OPTARG};; o) output_dir=${OPTARG};; n) intermediate_dir=${OPTARG};; + t) 
interaction_type=${OPTARG};; esac done # check if required argument is provided -if [ ! -d "$input_dir" ] || [ ! -d "$output_dir" ] || [ ! -d "$intermediate_dir" ]; then - echo "Usage: $0 -i input_dir -o output_dir -n intermediate_dir" +if [ ! -d "$input_dir" ] || [ ! -d "$output_dir" ] || [ ! -d "$intermediate_dir" ] || [ -z "$interaction_type" ]; then + echo "Usage: $0 -i input_dir -o output_dir -n intermediate_dir -t interaction_type" exit 1 fi @@ -82,7 +83,7 @@ for input_file in "$input_dir"/*.tsv; do echo "Sorting the input file with added clusters based on the noncodingRNA_fam column..." # Find the column number of the "noncodingRNA_fam" column - column_number=$(head -n 1 "$input_file_with_clusters" | tr '\t' '\n' | nl -v 0 | grep "noncodingRNA_fam" | awk '{print $1}') + column_number=$(head -n 1 "$input_file_with_clusters" | tr '\t' '\n' | nl -v 1 | grep "noncodingRNA_fam" | awk '{print $1}') # If the column number is found, sort the file by that column if [ -n "$column_number" ]; then @@ -95,7 +96,7 @@ for input_file in "$input_dir"/*.tsv; do # Step 5: Make negatives echo "Generating negatives for $mirfam_sorted_file..." - python3 "$make_negs_dir/make_neg_sets.py" --ifile "$mirfam_sorted_file" --ofile "$neg_output" + python3 "$make_negs_dir/make_neg_sets.py" --ifile "$mirfam_sorted_file" --ofile "$neg_output" --interaction_type "$interaction_type" if [ $? -ne 0 ]; then echo "Error in generating negative samples. Check your script and input file." 
exit 1 diff --git a/code/post_process/workflow_diagram.md b/code/post_process/workflow_diagram.md index 812a35b..a3feee3 100644 --- a/code/post_process/workflow_diagram.md +++ b/code/post_process/workflow_diagram.md @@ -2,105 +2,146 @@ graph LR %% Define files M[AGO2_eCLIP_Manakov22_full_dataset.tsv] - H[AGO2_CLASH_Hejret23_full_dataset.tsv] - K[AGO2_eCLIP_Klimentova22_full_dataset.tsv] %% Process 0: pp_0_filter_and_deduplicate pp_0[pp_0_filter_and_deduplicate] M_filtered[AGO2_eCLIP_Manakov22.filtered.deduplicated.tsv] - H_filtered[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.tsv] - K_filtered[AGO2_eCLIP_Klimentova22_full_dataset.filtered.deduplicated.tsv] %% Process 1: pp_1_exclude_mirna_families pp_1[pp_1_exclude_mirna_families] M_excluded[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.tsv] M_remaining[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.tsv] + + %% Process 1a: pp_1a_add_seeds_and_filter_interactions + pp_1a[pp_1a_add_seeds_and_filter_interactions] + M_excluded_canonical[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.canonical.tsv] + M_excluded_noncanonical[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.noncanonical.tsv] + M_excluded_nonseed[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.nonseed.tsv] + M_remaining_canonical[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.tsv] + M_remaining_noncanonical[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.tsv] + M_remaining_nonseed[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.tsv] %% Process 2: pp_2_make_negatives pp_2[pp_2_make_negatives] - M_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.tsv] - H_neg[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.tsv] - K_neg[AGO2_eCLIP_Klimentova22_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.tsv] - 
M_excluded_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.gene_clusters_added.mirfam_sorted.negatives.tsv] - - %% Process 3: pp_3_train_test_splits + M_excluded_canonical_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.canonical.gene_clusters_added.mirfam_sorted.negatives.tsv] + M_excluded_noncanonical_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.noncanonical.gene_clusters_added.mirfam_sorted.negatives.tsv] + M_excluded_nonseed_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.nonseed.gene_clusters_added.mirfam_sorted.negatives.tsv] + M_remaining_canonical_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.tsv] + M_remaining_noncanonical_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.tsv] + M_remaining_nonseed_neg[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.tsv] + + %% Process 3: pp_3_train_test_splits pp_3[pp_3_train_test_splits] - M_neg_train[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.train.tsv] - M_neg_test[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.test.tsv] - H_neg_train[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.train.tsv] - H_neg_test[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.test.tsv] + M_remaining_canonical_neg_train[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.train.tsv] + M_remaining_canonical_neg_test[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.test.tsv] + 
M_remaining_noncanonical_neg_train[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.train.tsv] + M_remaining_noncanonical_neg_test[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.test.tsv] + M_remaining_nonseed_neg_train[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.train.tsv] + M_remaining_nonseed_neg_test[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.test.tsv] - %% Process 4: pp_4_drop_test_col + %% Process 4: pp_4_drop_test_col pp_4[pp_4_drop_test_col] - M_train_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.tsv] - M_test_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.tsv] - H_train_drop[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.tsv] - H_test_drop[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.tsv] - K_neg_drop[AGO2_eCLIP_Klimentova22_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.tsv] - M_excluded_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.tsv] + M_remaining_canonical_neg_train_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.tsv] + M_remaining_canonical_neg_test_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.tsv] + 
M_remaining_noncanonical_neg_train_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.tsv] + M_remaining_noncanonical_neg_test_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.tsv] + M_remaining_nonseed_neg_train_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.tsv] + M_remaining_nonseed_neg_test_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.tsv] + %% Process 4: pp_4_drop_test_col + pp_4[pp_4_drop_test_col] + M_excluded_canonical_neg_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.canonical.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.tsv] + M_excluded_noncanonical_neg_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.noncanonical.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.tsv] + M_excluded_nonseed_neg_drop[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.nonseed.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.tsv] %% Process 5: pp_5_add_conservation pp_5[pp_5_add_conservation] - M_train_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.conservation.tsv] - M_test_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.conservation.tsv] - H_train_conserv[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.conservation.tsv] - H_test_conserv[AGO2_CLASH_Hejret23_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.conservation.tsv] - 
K_neg_conserv[AGO2_eCLIP_Klimentova22_full_dataset.filtered.deduplicated.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.conservation.tsv] - M_excluded_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.conservation.tsv] + M_remaining_canonical_neg_train_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.conservation.tsv] + M_remaining_canonical_neg_test_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.canonical.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.conservation.tsv] + M_remaining_noncanonical_neg_train_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.conservation.tsv] + M_remaining_noncanonical_neg_test_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.noncanonical.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.conservation.tsv] + M_remaining_nonseed_neg_train_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.train.drop_test_col.conservation.tsv] + M_remaining_nonseed_neg_test_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.remaining.nonseed.gene_clusters_added.mirfam_sorted.negatives.test.drop_test_col.conservation.tsv] + M_excluded_canonical_neg_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.canonical.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.conservation.tsv] + M_excluded_noncanonical_neg_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.noncanonical.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.conservation.tsv] + M_excluded_nonseed_neg_conserv[AGO2_eCLIP_Manakov22.filtered.deduplicated.excluded.nonseed.gene_clusters_added.mirfam_sorted.negatives.drop_test_col.conservation.tsv] %% Connections M --> pp_0 - H --> pp_0 - K --> pp_0 pp_0 --> 
M_filtered - pp_0 --> H_filtered - pp_0 --> K_filtered M_filtered --> pp_1 pp_1 --> M_excluded pp_1 --> M_remaining - M_remaining --> pp_2 - M_excluded --> pp_2 - H_filtered --> pp_2 - K_filtered --> pp_2 - pp_2 --> M_neg - pp_2 --> H_neg - pp_2 --> K_neg - pp_2 --> M_excluded_neg - M_neg --> pp_3 - H_neg --> pp_3 - pp_3 --> M_neg_train - pp_3 --> M_neg_test - pp_3 --> H_neg_train - pp_3 --> H_neg_test - M_neg_train --> pp_4 - M_neg_test --> pp_4 - H_neg_train --> pp_4 - H_neg_test --> pp_4 - K_neg --> pp_4 - M_excluded_neg --> pp_4 - pp_4 --> M_train_drop - pp_4 --> M_test_drop - pp_4 --> H_train_drop - pp_4 --> H_test_drop - pp_4 --> K_neg_drop - pp_4 --> M_excluded_drop - M_train_drop --> pp_5 - M_test_drop --> pp_5 - H_train_drop --> pp_5 - H_test_drop --> pp_5 - K_neg_drop --> pp_5 - M_excluded_drop --> pp_5 - pp_5 --> M_train_conserv - pp_5 --> M_test_conserv - pp_5 --> H_train_conserv - pp_5 --> H_test_conserv - pp_5 --> K_neg_conserv - pp_5 --> M_excluded_conserv + M_excluded --> pp_1a + M_remaining --> pp_1a + pp_1a --> M_excluded_canonical + pp_1a --> M_excluded_noncanonical + pp_1a --> M_excluded_nonseed + pp_1a --> M_remaining_canonical + pp_1a --> M_remaining_noncanonical + pp_1a --> M_remaining_nonseed + M_excluded_canonical --> pp_2 + M_excluded_noncanonical --> pp_2 + M_excluded_nonseed --> pp_2 + M_remaining_canonical --> pp_2 + M_remaining_noncanonical --> pp_2 + M_remaining_nonseed --> pp_2 + pp_2 --> M_excluded_canonical_neg + pp_2 --> M_excluded_noncanonical_neg + pp_2 --> M_excluded_nonseed_neg + pp_2 --> M_remaining_canonical_neg + pp_2 --> M_remaining_noncanonical_neg + pp_2 --> M_remaining_nonseed_neg + M_remaining_canonical_neg --> pp_3 + M_remaining_noncanonical_neg --> pp_3 + M_remaining_nonseed_neg --> pp_3 + pp_3 --> M_remaining_canonical_neg_train + pp_3 --> M_remaining_canonical_neg_test + pp_3 --> M_remaining_noncanonical_neg_train + pp_3 --> M_remaining_noncanonical_neg_test + pp_3 --> M_remaining_nonseed_neg_train + pp_3 --> 
M_remaining_nonseed_neg_test + M_remaining_canonical_neg_train --> pp_4 + M_remaining_canonical_neg_test --> pp_4 + M_remaining_noncanonical_neg_train --> pp_4 + M_remaining_noncanonical_neg_test --> pp_4 + M_remaining_nonseed_neg_train --> pp_4 + M_remaining_nonseed_neg_test --> pp_4 + M_excluded_canonical_neg --> pp_4 + M_excluded_noncanonical_neg --> pp_4 + M_excluded_nonseed_neg --> pp_4 + pp_4 --> M_remaining_canonical_neg_train_drop + pp_4 --> M_remaining_canonical_neg_test_drop + pp_4 --> M_remaining_noncanonical_neg_train_drop + pp_4 --> M_remaining_noncanonical_neg_test_drop + pp_4 --> M_remaining_nonseed_neg_train_drop + pp_4 --> M_remaining_nonseed_neg_test_drop + pp_4 --> M_excluded_canonical_neg_drop + pp_4 --> M_excluded_noncanonical_neg_drop + pp_4 --> M_excluded_nonseed_neg_drop + M_remaining_canonical_neg_train_drop --> pp_5 + M_remaining_canonical_neg_test_drop --> pp_5 + M_remaining_noncanonical_neg_train_drop --> pp_5 + M_remaining_noncanonical_neg_test_drop --> pp_5 + M_remaining_nonseed_neg_train_drop --> pp_5 + M_remaining_nonseed_neg_test_drop --> pp_5 + M_excluded_canonical_neg_drop --> pp_5 + M_excluded_noncanonical_neg_drop --> pp_5 + M_excluded_nonseed_neg_drop --> pp_5 + pp_5 --> M_remaining_canonical_neg_train_conserv + pp_5 --> M_remaining_canonical_neg_test_conserv + pp_5 --> M_remaining_noncanonical_neg_train_conserv + pp_5 --> M_remaining_noncanonical_neg_test_conserv + pp_5 --> M_remaining_nonseed_neg_train_conserv + pp_5 --> M_remaining_nonseed_neg_test_conserv + pp_5 --> M_excluded_canonical_neg_conserv + pp_5 --> M_excluded_noncanonical_neg_conserv + pp_5 --> M_excluded_nonseed_neg_conserv %% Define file and process styles classDef file fill:#E0F7FA,stroke:#00796B,stroke-width:2px,color:black; classDef process fill:#FFE0B2,stroke:#E65100,stroke-width:2px,color:black; %% Apply styles - class 
M,H,K,M_filtered,H_filtered,K_filtered,M_excluded,M_remaining,M_neg,H_neg,K_neg,M_excluded_neg,M_neg_train,M_neg_test,H_neg_train,H_neg_test,M_train_drop,M_test_drop,H_train_drop,H_test_drop,K_neg_drop,M_excluded_drop,M_train_conserv,M_test_conserv,H_train_conserv,H_test_conserv,K_neg_conserv,M_excluded_conserv file; - class pp_0,pp_1,pp_2,pp_3,pp_4,pp_5 process; \ No newline at end of file + class M,M_filtered,M_excluded,M_remaining,M_excluded_canonical,M_excluded_noncanonical,M_excluded_nonseed,M_remaining_canonical,M_remaining_noncanonical,M_remaining_nonseed,M_excluded_canonical_neg,M_excluded_noncanonical_neg,M_excluded_nonseed_neg,M_remaining_canonical_neg,M_remaining_noncanonical_neg,M_remaining_nonseed_neg,M_remaining_canonical_neg_train,M_remaining_canonical_neg_test,M_remaining_noncanonical_neg_train,M_remaining_noncanonical_neg_test,M_remaining_nonseed_neg_train,M_remaining_nonseed_neg_test,M_excluded_canonical_neg_drop,M_excluded_noncanonical_neg_drop,M_excluded_nonseed_neg_drop,M_remaining_canonical_neg_train_drop,M_remaining_canonical_neg_test_drop,M_remaining_noncanonical_neg_train_drop,M_remaining_noncanonical_neg_test_drop,M_remaining_nonseed_neg_train_drop,M_remaining_nonseed_neg_test_drop,M_excluded_canonical_neg_conserv,M_excluded_noncanonical_neg_conserv,M_excluded_nonseed_neg_conserv,M_remaining_canonical_neg_train_conserv,M_remaining_canonical_neg_test_conserv,M_remaining_noncanonical_neg_train_conserv,M_remaining_noncanonical_neg_test_conserv,M_remaining_nonseed_neg_train_conserv,M_remaining_nonseed_neg_test_conserv file; + class pp_0,pp_1,pp_1a,pp_2,pp_3,pp_4,pp_5 process; +