diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/README.md b/analysis/miRBind_CNN_retraining_orig_parameters/README.md new file mode 100644 index 0000000..de651fb --- /dev/null +++ b/analysis/miRBind_CNN_retraining_orig_parameters/README.md @@ -0,0 +1,15 @@ +# miRBind CNN retraining with original parameters + +Run +```bash run_retraining.sh``` +to retrain the miRBind CNN as presented in the [miRBind paper](https://doi.org/10.3390/genes13122323) on Manakov 1:1 train dataset. +The training is done with the original hyperparameters used in the paper. + +### Dependencies + +- python=3.8 +- tensorflow=2.13 +- matplotlib +- numpy +- pandas + diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh b/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh new file mode 100644 index 0000000..3afd191 --- /dev/null +++ b/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +DATASET="../../data/chimeric_datasets/Manakov2022/AGO_eCLIP_Manakov2022_1_train_dataset.tsv" +MODEL="../../models/miRBind_CNN_retrained_Manakov_1_orig_parameters.keras" +CODE="../../code/machine_learning" + +mkdir -p encoded_dataset + +# encode dataset +python $CODE/encode/binding_2D_matrix_encoder.py --i_file $DATASET --o_prefix encoded_dataset/AGO2_eCLIP_Manakov2022_1_train + +# train model +python $CODE/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py \ +--data encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_dataset.npy \ +--labels encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_labels.npy \ +--dataset_size 2524246 \ +--ratio 1 \ +--model $MODEL \ No newline at end of file diff --git a/code/machine_learning/encode/README.md b/code/machine_learning/encode/README.md index 79fe152..697c03e 100644 --- a/code/machine_learning/encode/README.md +++ b/code/machine_learning/encode/README.md @@ -1 +1,17 @@ -# Encoding the dataset into inner representation \ No newline at end of file +# Encoding the dataset into 
inner representation + +### [Binding 2D matrix encoder](binding_2D_matrix_encoder.py) + The encoder is based on the "miRBind: A deep learning method for miRNA binding classification." (2022) https://doi.org/10.3390/genes13122323 + with the original Python implementation here: https://github.com/ML-Bioinfo-CEITEC/miRBind + +Encodes miRNA and gene sequences into a 2D-binding matrix. +The 2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise. + +Outputs an npy file with the encoded matrices and an npy file with the corresponding labels (both are written as raw float32 `np.memmap` arrays, so load them with `np.memmap`). + +#### Usage +Run the script from the command line with the following syntax: + + +```python binding_2D_matrix_encoder.py --i_file input_dataset_file.tsv --o_prefix output_prefix``` + diff --git a/code/machine_learning/encode/binding_2D_matrix_encoder.py b/code/machine_learning/encode/binding_2D_matrix_encoder.py index 70b6360..484e534 100644 --- a/code/machine_learning/encode/binding_2D_matrix_encoder.py +++ b/code/machine_learning/encode/binding_2D_matrix_encoder.py @@ -1,37 +1,102 @@ -class miRBindEncoder(): +import pandas as pd +import numpy as np +import argparse +import time + + +def binding_encoding(df, alphabet, tensor_dim=(50, 20, 1)): + """ + Transform input sequence pairs to a binding matrix with corresponding labels. 
+ + Parameters: + - df: Pandas DataFrame with columns "noncodingRNA", "gene", "label" + - alphabet: dictionary with letter tuples as keys and 1s when they bind + - tensor_dim: 2D binding matrix shape; sequences longer than it are truncated + + Output: + 2D binding matrix, labels as np array + """ + labels = df["label"].to_numpy() + + # Initialize dot matrix with zeros + ohe_matrix_2d = np.zeros((len(df), *tensor_dim), dtype="float32") + + df = df.reset_index(drop=True) + + # Compile matrix with Watson-Crick interactions + for index, row in df.iterrows(): + for bind_index, bind_nt in enumerate(row['gene'][:tensor_dim[0]].upper()): # truncate so a long target cannot index past the matrix + for ncrna_index, ncrna_nt in enumerate(row['noncodingRNA'].upper()): + if ncrna_index >= tensor_dim[1]: + break + base_pairs = bind_nt + ncrna_nt + ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0) + + return ohe_matrix_2d, labels + + +def encode_large_tsv_to_numpy(tsv_file_path, data_output_path, labels_output_path, chunk_size=10000): + """ + Encode a large TSV file into a NumPy matrix using chunk processing. + + Parameters: + - tsv_file_path: Path to the TSV file with dataset. + - data_output_path: Path to the output data .npy file. + - labels_output_path: Path to the output labels .npy file. + - chunk_size: Number of rows to process at a time. """ - Based on Klimentová, Eva, et al. "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323. 
- Python implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind + # Alphabet for Watson-Crick interactions + alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.} + tensor_dim = (50, 20, 1) + + # Get total number of rows in the dataset + num_rows = sum(len(df) for df in pd.read_csv(tsv_file_path, sep='\t', usecols=[0], chunksize=chunk_size)) + + # Determine the shape of the output arrays + labels_shape = (num_rows,) + data_shape = (num_rows, *tensor_dim) + + # Create memory-mapped files + ohe_matrix_2d = np.memmap(data_output_path, dtype='float32', mode='w+', shape=data_shape) + labels = np.memmap(labels_output_path, dtype='float32', mode='w+', shape=labels_shape) + + row_offset = 0 + + # Process each chunk + for chunk in pd.read_csv(tsv_file_path, sep='\t', chunksize=chunk_size): + encoded_data, encoded_labels = binding_encoding(chunk, alphabet, tensor_dim) + + # Write the chunk's data and labels to the memory-mapped files + ohe_matrix_2d[row_offset:row_offset + len(chunk)] = encoded_data + labels[row_offset:row_offset + len(chunk)] = encoded_labels + row_offset += len(chunk) + + # Flush changes to disk + ohe_matrix_2d.flush() + labels.flush() + + +def main(): + """ + Based on "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323. + Original implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind Encodes miRNA and gene sequences into 2D-binding matrix. - 2D-binding matrix has shape (gene_max_len, miRNA_max_len, 1) and contains 1 for Watson-Crick interactions and 0 otherwise. - Returns array with shape (num_of_samples, gene_max_len, miRNA_max_len, 1). + 2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise. 
""" - def __call__(self, df, miRNA_col="noncodingRNA", gene_col="gene", tensor_dim=(50, 20, 1)): - return self.binding_encoding(df, miRNA_col, gene_col, tensor_dim) - - def binding_encoding(self, df, miRNA_col, gene_col, tensor_dim): - """ - fun encodes miRNAs and mRNAs in df into binding matrices - :param df: dataframe containing gene_col and miRNA_col columns - :param tensor_dim: output shape of the matrix. If sequences are longer than tensor_dim, they will be truncated. - :return: 2D binding matrix with shape (N, *tensor_dim) - """ - - # alphabet for watson-crick interactions. - alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1., "AU": 1., "UA": 1.} - # create empty main 2d matrix array - N = df.shape[0] # number of samples in df - shape_matrix_2d = (N, *tensor_dim) # 2d matrix shape - # initialize dot matrix with zeros - ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32") - - # compile matrix with watson-crick interactions. - for index, row in df.iterrows(): - for bind_index, bind_nt in enumerate(row[gene_col][:tensor_dim[0]].upper()): - for mirna_index, mirna_nt in enumerate(row[miRNA_col][:tensor_dim[1]].upper()): - base_pairs = bind_nt + mirna_nt - ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0) - - return ohe_matrix_2d \ No newline at end of file + parser = argparse.ArgumentParser( + description="Encode dataset to miRNA x target binding matrix. Outputs numpy file with matrices and and numpy file with corresponding labels. 
Expected columns of the dataset are 'noncodingRNA', 'gene' and 'label'") + parser.add_argument('-i', '--i_file', type=str, required=True, help="Input dataset file name") + parser.add_argument('-o', '--o_prefix', type=str, required=True, help="Output file name prefix") + args = parser.parse_args() + + start = time.time() + encode_large_tsv_to_numpy(args.i_file, args.o_prefix + '_dataset.npy', args.o_prefix + '_labels.npy') + end = time.time() + + print("Elapsed time: ", end - start, " s.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py new file mode 100644 index 0000000..48d8bef --- /dev/null +++ b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py @@ -0,0 +1,60 @@ +import tensorflow as tf +from tensorflow import keras as K +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.layers import Input, Conv2D, LeakyReLU, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense + + +class miRBind_CNN(): + """ + Build model architecture based on the CNN model presented in miRBind paper (2022) https://doi.org/10.3390/genes13122323 + The default parameters are same as the ones used in the paper + """ + def __init__(self, cnn_num = 6, kernel_size = 5, pool_size = 2, dropout_rate = 0.3, dense_num = 2): + + x = Input(shape=(50,20,1), dtype='float32') + main_input = x + + for cnn_i in range(cnn_num): + x = Conv2D( + filters=32 * (cnn_i + 1), + kernel_size=(kernel_size, kernel_size), + padding="same", + data_format="channels_last")(x) + x = LeakyReLU()(x) + x = BatchNormalization()(x) + x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same')(x) + x = Dropout(rate=dropout_rate)(x) + + x = Flatten()(x) + + for dense_i in range(dense_num): + neurons = 32 * (cnn_num - dense_i) + x = Dense(neurons)(x) + x = LeakyReLU()(x) + x = BatchNormalization()(x) + x = 
Dropout(rate=dropout_rate)(x) + + main_output = Dense(1, activation='sigmoid')(x) + + model = K.Model(inputs=[main_input], outputs=[main_output], name='miRBind_CNN') + + self.model = model + + def compile_model(self, lr=0.00152): + K.backend.clear_session() + model = self.model + + opt = Adam( + learning_rate=lr, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-07, + amsgrad=False, + name="Adam") + + model.compile( + optimizer=opt, + loss='binary_crossentropy', + metrics=['accuracy'] + ) + return model \ No newline at end of file diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py new file mode 100644 index 0000000..6d8e1cc --- /dev/null +++ b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py @@ -0,0 +1,139 @@ +import numpy as np +import argparse +import time +import tensorflow as tf +from tensorflow import keras as K +import matplotlib.pyplot as plt +from tensorflow.keras.utils import Sequence + +from miRBind_CNN_architecture import miRBind_CNN + + +class DataGenerator(Sequence): + # preload the encoded numpy data + def __init__(self, data_path, labels_path, dataset_size, batch_size, validation_split=0.1, + is_validation=False, shuffle=True): + # the dataset size is needed to properly load the numpy files + self.size = dataset_size + + self.data = np.memmap(data_path, dtype='float32', mode='r', shape=(self.size, 50, 20, 1)) + self.labels = np.memmap(labels_path, dtype='float32', mode='r', shape=(self.size,)) + self.batch_size = batch_size + self.shuffle = shuffle + + # Determine number of train and validation samples + self.validation_split = validation_split + self.num_samples = len(self.data) + self.num_validation_samples = int(self.num_samples * validation_split) + self.num_train_samples = self.num_samples - self.num_validation_samples + + # Determine indices for validation and training + indices = 
np.arange(self.num_samples) + if shuffle: + np.random.RandomState(42).shuffle(indices) # fixed seed: the train and validation instances must slice the SAME permutation, otherwise their index sets overlap + + if is_validation: + self.indices = indices[self.num_train_samples:] + else: + self.indices = indices[:self.num_train_samples] + + # Shuffle the data initially + self.on_epoch_end() + + def __len__(self): + # Denotes the number of batches per epoch + return int(np.ceil(len(self.indices) / float(self.batch_size))) + + def __getitem__(self, idx): + # Generate one batch of data + batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size] + batch_data = self.data[batch_indices] + batch_labels = self.labels[batch_indices] + return batch_data, batch_labels + + def on_epoch_end(self): + # Updates indices after each epoch for shuffling + if self.shuffle: + np.random.shuffle(self.indices) + + +def plot_history(history, ratio): + """ + Plot history of the model training, + accuracy and loss of the training and validation set + """ + + acc = history.history['accuracy'] + val_acc = history.history['val_accuracy'] + loss = history.history['loss'] + val_loss = history.history['val_loss'] + + epochs = range(1, len(acc) + 1) + + plt.figure(figsize=(8, 6), dpi=80) + + plt.plot(epochs, acc, 'bo', label='Training acc') + plt.plot(epochs, val_acc, 'b', label='Validation acc') + plt.title('Accuracy') + plt.legend() + plt.savefig(f"training_acc_1_{ratio}.jpg") + + plt.figure() + + plt.plot(epochs, loss, 'bo', label='Training loss') + plt.plot(epochs, val_loss, 'b', label='Validation loss') + plt.title('Loss') + plt.legend() + plt.savefig(f"training_loss_1_{ratio}.jpg") + + +def train_model(data, labels, dataset_size, ratio, model_file, debug=False): + # set random state for reproducibility + np.random.seed(42) + tf.random.set_seed(42) + K.utils.set_random_seed(42) + # TODO still not fully reproducible? why? 
+ + train_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1, + is_validation=False) + val_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1, + is_validation=True) + + model = miRBind_CNN().compile_model() + model_history = model.fit( + train_data_gen, + validation_data=val_data_gen, + epochs=10, + class_weight={0: 1, 1: ratio} + ) + + if debug: + plot_history(model_history, ratio) + + model.save(model_file) + + +def main(): + parser = argparse.ArgumentParser(description="Train CNN model on encoded miRNA x target binding matrix dataset") + parser.add_argument('--ratio', type=int, required=True, help="Ratio of pos:neg in the training dataset") + parser.add_argument('--data', type=str, required=True, help="File with the encoded dataset") + parser.add_argument('--labels', type=str, required=True, help="File with the dataset labels") + parser.add_argument('--dataset_size', type=int, required=True, + help="Number of samples in the dataset. 
Needed to properly load the numpy files.") + parser.add_argument('--model', type=str, required=False, help="Filename to save the trained model") + parser.add_argument('--debug', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help="Set to True to output some plots about training") # type=bool would turn any non-empty string, even "False", into True + args = parser.parse_args() + + if args.model is None: + args.model = f"model_1_{args.ratio}.keras" + + start = time.time() + train_model(data=args.data, labels=args.labels, dataset_size=args.dataset_size, ratio=args.ratio, + model_file=args.model, debug=args.debug) + end = time.time() + + print("Elapsed time: ", end - start, " s.") + + +if __name__ == "__main__": + main() diff --git a/code/machine_learning/train/CNN_miRBind_2022/training.ipynb b/code/machine_learning/train/CNN_miRBind_2022/training.ipynb deleted file mode 100644 index cf24ebf..0000000 --- a/code/machine_learning/train/CNN_miRBind_2022/training.ipynb +++ /dev/null @@ -1,442 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Training.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "T6BIHgU38o2f" - }, - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import tensorflow as tf\n", - "from tensorflow import keras as K\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib.pyplot import figure\n", - "from tensorflow.keras.layers import (\n", - " BatchNormalization, LeakyReLU,\n", - " Input, Dense, Conv2D,\n", - " MaxPooling2D, Flatten, Dropout)\n", - "from tensorflow.keras.optimizers import Adam" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EfIQCQj2r3fV", - "outputId": "26867200-18e8-4b4e-d34d-129c7203f694" - }, - "source": [ - "!wget 
https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2022-04-24 17:22:50-- https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 12518906 (12M) [text/plain]\n", - "Saving to: ‘train_set_1_10_CLASH2013_paper.tsv’\n", - "\n", - "train_set_1_10_CLAS 100%[===================>] 11.94M --.-KB/s in 0.09s \n", - "\n", - "2022-04-24 17:22:51 (130 MB/s) - ‘train_set_1_10_CLASH2013_paper.tsv’ saved [12518906/12518906]\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ALmjW7vd9u60" - }, - "source": [ - "def binding_encoding(df, tensor_dim=(50,20,1)):\n", - " \"\"\"\n", - " fun transform input database to numpy array.\n", - " \n", - " parameters:\n", - " df = Pandas df with col names \"noncodingRNA\", \"gene\", \"label\"\n", - " tensor_dim = 2d matrix shape\n", - " \n", - " output:\n", - " 2d dot matrix, labels as np array\n", - " \"\"\"\n", - " df.reset_index(inplace=True, drop=True)\n", - "\n", - " # alphabet for watson-crick interactions.\n", - " alphabet = {\"AT\": 1., \"TA\": 1., \"GC\": 1., \"CG\": 1.} \n", - "\n", - " # labels to one hot encoding\n", - " labels = df[\"label\"].to_numpy()\n", - "\n", - " # create empty main 2d matrix array\n", - " N = df.shape[0] # number of samples in df\n", - " shape_matrix_2d = (N, *tensor_dim) # 2d matrix shape \n", - " # initialize dot matrix with zeros\n", - " ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype=\"float32\")\n", - "\n", - " # compile matrix with watson-crick 
interactions.\n", - " for index, row in df.iterrows(): \n", - " for bind_index, bind_nt in enumerate(row.gene.upper()):\n", - " \n", - " for ncrna_index, ncrna_nt in enumerate(row.noncodingRNA.upper()):\n", - " if ncrna_index >= tensor_dim[1]:\n", - " break\n", - " base_pairs = bind_nt + ncrna_nt\n", - " ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0)\n", - " \n", - "\n", - " return ohe_matrix_2d, labels" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "CpoytQwIElkg" - }, - "source": [ - "def make_architecture():\n", - " \"\"\"\n", - " build model architecture\n", - "\n", - " return a model object\n", - " \"\"\"\n", - " cnn_num = 6\n", - " kernel_size = 5\n", - " pool_size = 2\n", - " dropout_rate = 0.3\n", - " dense_num = 2\n", - "\n", - " x = Input(shape=(50,20,1),\n", - " dtype='float32', name='main_input'\n", - " )\n", - " main_input = x\n", - "\n", - " for cnn_i in range(cnn_num):\n", - " x = Conv2D(\n", - " filters=32 * (cnn_i + 1),\n", - " kernel_size=(kernel_size, kernel_size),\n", - " padding=\"same\",\n", - " data_format=\"channels_last\",\n", - " name=\"conv_\" + str(cnn_i + 1))(x)\n", - " x = LeakyReLU()(x)\n", - " x = BatchNormalization()(x)\n", - " x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same', name='Max_' + str(cnn_i + 1))(x)\n", - " x = Dropout(rate=dropout_rate)(x)\n", - "\n", - " x = Flatten(name='2d_matrix')(x)\n", - "\n", - " for dense_i in range(dense_num):\n", - " neurons = 32 * (cnn_num - dense_i)\n", - " x = Dense(neurons)(x)\n", - " x = LeakyReLU()(x)\n", - " x = BatchNormalization()(x)\n", - " x = Dropout(rate=dropout_rate)(x)\n", - "\n", - " main_output = Dense(1, activation='sigmoid', name='main_output')(x)\n", - "\n", - " model = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')\n", - " \n", - " return model" - ], - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": 
"oZ591qC0Femi" - }, - "source": [ - "def compile_model():\n", - " K.backend.clear_session()\n", - " model = make_architecture()\n", - " \n", - " opt = Adam(\n", - " learning_rate=0.00152,\n", - " beta_1=0.9,\n", - " beta_2=0.999,\n", - " epsilon=1e-07,\n", - " amsgrad=False,\n", - " name=\"Adam\")\n", - "\n", - " model.compile(\n", - " optimizer=opt,\n", - " loss='binary_crossentropy',\n", - " metrics=['accuracy']\n", - " )\n", - " return model" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "wTB6K0lxyzcx" - }, - "source": [ - "def plot_history(history):\n", - " \"\"\"\n", - " plot history of the training of the model,\n", - " accuracy and loss of the training and validation set\n", - " \"\"\"\n", - " \n", - " acc = history.history['accuracy']\n", - " val_acc = history.history['val_accuracy']\n", - " loss = history.history['loss']\n", - " val_loss = history.history['val_loss']\n", - "\n", - " epochs = range(1, len(acc) + 1)\n", - "\n", - " plt.figure(figsize=(8, 6), dpi=80)\n", - "\n", - " plt.plot(epochs, acc, 'bo', label='Training acc')\n", - " plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - " plt.title('Accuracy')\n", - " plt.legend()\n", - " plt.figure()\n", - "\n", - " plt.plot(epochs, loss, 'bo', label='Training loss')\n", - " plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - " plt.title('Loss')\n", - " plt.legend()\n", - " plt.show()" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3Bh2-XEPxmZf", - "outputId": "3dffc4d8-d135-46fd-dcf4-94edb75db0cb" - }, - "source": [ - "train_df = pd.read_csv('train_set_1_10_CLASH2013_paper.tsv', sep='\\t', names=['noncodingRNA', 'gene', 'label'], header=0)\n", - "# set random state for reproducibility\n", - "RANDOM_STATE = 42\n", - "np.random.seed(RANDOM_STATE)\n", - "train_df = train_df.sample(frac=1, random_state=RANDOM_STATE)\n", - 
"print(train_df.head())\n", - "ohe_data = binding_encoding(train_df)\n", - "train_ohe, labels = ohe_data\n", - "print(\"Number of training samples: \", train_df.shape[0])" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " noncodingRNA \\\n", - "45236 ACTGCATTATGAGCACTTAA \n", - "168824 TATTGCACTTGTCCCGGCCT \n", - "2591 AAAAGCTGGGTTGAGAGGGC \n", - "76746 TCTCACACAGAAATCGCACC \n", - "63277 TGAGGTAGTAGTTTGTGCTG \n", - "\n", - " gene label \n", - "45236 GAGAAGAAATCTGGCTGGTTTGAGGGTTTCCTTTAGTTCACCCTCA... 0 \n", - "168824 GTAAATGTCTGTTTTTCATAATTGCTCTTTATATTGTGTGTTATCT... 0 \n", - "2591 GTACCCAGTAAAAACCAGAATGACCCATTGCCAGGACGCATCAAAG... 1 \n", - "76746 ACGTCGGCGCCATGCTCCAGGTACAGAGCCACATGTTGCTCCAGGC... 0 \n", - "63277 ACCAATGCCAGAGGAGCAACAGCGGCAACCTTTGGCACTGCATCCA... 0 \n", - "Number of training samples: 169312\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Lcc6OuabyVsK", - "outputId": "bb4e3fae-f08d-4d37-acfe-ff41c66e83eb" - }, - "source": [ - "model = compile_model()\n", - "model.summary()" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Model: \"arch_00\"\n", - "_________________________________________________________________\n", - " Layer (type) Output Shape Param # \n", - "=================================================================\n", - " main_input (InputLayer) [(None, 50, 20, 1)] 0 \n", - " \n", - " conv_1 (Conv2D) (None, 50, 20, 32) 832 \n", - " \n", - " leaky_re_lu (LeakyReLU) (None, 50, 20, 32) 0 \n", - " \n", - " batch_normalization (BatchN (None, 50, 20, 32) 128 \n", - " ormalization) \n", - " \n", - " Max_1 (MaxPooling2D) (None, 25, 10, 32) 0 \n", - " \n", - " dropout (Dropout) (None, 25, 10, 32) 0 \n", - " \n", - " conv_2 (Conv2D) (None, 25, 10, 64) 51264 \n", - " \n", - " leaky_re_lu_1 (LeakyReLU) (None, 25, 10, 64) 0 \n", - " \n", - " 
batch_normalization_1 (Batc (None, 25, 10, 64) 256 \n", - " hNormalization) \n", - " \n", - " Max_2 (MaxPooling2D) (None, 13, 5, 64) 0 \n", - " \n", - " dropout_1 (Dropout) (None, 13, 5, 64) 0 \n", - " \n", - " conv_3 (Conv2D) (None, 13, 5, 96) 153696 \n", - " \n", - " leaky_re_lu_2 (LeakyReLU) (None, 13, 5, 96) 0 \n", - " \n", - " batch_normalization_2 (Batc (None, 13, 5, 96) 384 \n", - " hNormalization) \n", - " \n", - " Max_3 (MaxPooling2D) (None, 7, 3, 96) 0 \n", - " \n", - " dropout_2 (Dropout) (None, 7, 3, 96) 0 \n", - " \n", - " conv_4 (Conv2D) (None, 7, 3, 128) 307328 \n", - " \n", - " leaky_re_lu_3 (LeakyReLU) (None, 7, 3, 128) 0 \n", - " \n", - " batch_normalization_3 (Batc (None, 7, 3, 128) 512 \n", - " hNormalization) \n", - " \n", - " Max_4 (MaxPooling2D) (None, 4, 2, 128) 0 \n", - " \n", - " dropout_3 (Dropout) (None, 4, 2, 128) 0 \n", - " \n", - " conv_5 (Conv2D) (None, 4, 2, 160) 512160 \n", - " \n", - " leaky_re_lu_4 (LeakyReLU) (None, 4, 2, 160) 0 \n", - " \n", - " batch_normalization_4 (Batc (None, 4, 2, 160) 640 \n", - " hNormalization) \n", - " \n", - " Max_5 (MaxPooling2D) (None, 2, 1, 160) 0 \n", - " \n", - " dropout_4 (Dropout) (None, 2, 1, 160) 0 \n", - " \n", - " conv_6 (Conv2D) (None, 2, 1, 192) 768192 \n", - " \n", - " leaky_re_lu_5 (LeakyReLU) (None, 2, 1, 192) 0 \n", - " \n", - " batch_normalization_5 (Batc (None, 2, 1, 192) 768 \n", - " hNormalization) \n", - " \n", - " Max_6 (MaxPooling2D) (None, 1, 1, 192) 0 \n", - " \n", - " dropout_5 (Dropout) (None, 1, 1, 192) 0 \n", - " \n", - " 2d_matrix (Flatten) (None, 192) 0 \n", - " \n", - " dense (Dense) (None, 192) 37056 \n", - " \n", - " leaky_re_lu_6 (LeakyReLU) (None, 192) 0 \n", - " \n", - " batch_normalization_6 (Batc (None, 192) 768 \n", - " hNormalization) \n", - " \n", - " dropout_6 (Dropout) (None, 192) 0 \n", - " \n", - " dense_1 (Dense) (None, 160) 30880 \n", - " \n", - " leaky_re_lu_7 (LeakyReLU) (None, 160) 0 \n", - " \n", - " batch_normalization_7 (Batc (None, 160) 640 \n", 
- " hNormalization) \n", - " \n", - " dropout_7 (Dropout) (None, 160) 0 \n", - " \n", - " main_output (Dense) (None, 1) 161 \n", - " \n", - "=================================================================\n", - "Total params: 1,865,665\n", - "Trainable params: 1,863,617\n", - "Non-trainable params: 2,048\n", - "_________________________________________________________________\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "vIEbdJxqydNm" - }, - "source": [ - "model_history = model.fit(\n", - " train_ohe, labels,\n", - " validation_split=0.05, epochs=10,\n", - " batch_size=32,\n", - " class_weight={0 : 1, 1 : 10}\n", - " )" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "e4uUTu-k0Y2S" - }, - "source": [ - "plot_history(model_history)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "WDSXwnZmymfK" - }, - "source": [ - "model.save(\"model.h5\")" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/code/machine_learning/train/README.md b/code/machine_learning/train/README.md index 2eb5163..ef527dd 100644 --- a/code/machine_learning/train/README.md +++ b/code/machine_learning/train/README.md @@ -1 +1,8 @@ -# Training the models \ No newline at end of file +# Training the models + +### CNN miRBind 2022 +This directory aggregates models based on the miRBind CNN architecture. It was presented in this miRBind paper (2022) https://doi.org/10.3390/genes13122323 + +[miRBind CNN architecture](CNN_miRBind_2022/miRBind_CNN_architecture.py) - containing definition of the CNN model architecture + +[miRBind CNN training with original parameters](CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py) - containing training of the CNN model with the original parameters described in the paper \ No newline at end of file