BioGeMT · evaklimentova · Oct 2, 2024 · Oct 15, 2024 · Oct 22, 2024
diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/README.md b/analysis/miRBind_CNN_retraining_orig_parameters/README.md
@@ -0,0 +1,15 @@
+# miRBind CNN retraining with original parameters
+
+Run 
+```bash run_retraining.sh```
+to retrain the miRBind CNN as presented in the [miRBind paper](https://doi.org/10.3390/genes13122323) on Manakov 1:1 train dataset.
+The training is done with the original hyperparameters used in the paper.
+
+### Dependencies
+
+- python=3.8
+- tensorflow=2.13
+- matplotlib
+- numpy
+- pandas
+
diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh b/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+DATASET="../../data/chimeric_datasets/Manakov2022/AGO_eCLIP_Manakov2022_1_train_dataset.tsv"
+MODEL="../../models/miRBind_CNN_retrained_Manakov_1_orig_parameters.keras"
+CODE="../../code/machine_learning"
+
+mkdir -p encoded_dataset
+
+# encode dataset
+python $CODE/encode/binding_2D_matrix_encoder.py --i_file $DATASET --o_prefix encoded_dataset/AGO2_eCLIP_Manakov2022_1_train
+
+# train model
+python $CODE/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py \
+--data encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_dataset.npy \
+--labels encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_labels.npy \
+--dataset_size 2524246 \
+--ratio 1 \
+--model $MODEL
diff --git a/code/machine_learning/encode/README.md b/code/machine_learning/encode/README.md
@@ -1 +1,17 @@
-# Encoding the dataset into inner representation
+# Encoding the dataset into inner representation
+
+### [Binding 2D matrix encoder](binding_2d_matrix_encoder.py)
+ The encoder is based on the "miRBind: A deep learning method for miRNA binding classification." (2022) https://doi.org/10.3390/genes13122323
+ with original python implementation here: https://github.com/ML-Bioinfo-CEITEC/miRBind
+
+Encodes miRNA and gene sequences into 2D-binding matrix.
+2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
+
+Outputs npy file with encoded matrices and npy file with corresponding labels.
+
+#### Usage
+Run the script from the command line with the following syntax:
+
+
+```python binding_2d_matrix_encoder.py --i_file input_dataset_file.tsv --o_prefix output_prefix```
+
diff --git a/code/machine_learning/encode/binding_2D_matrix_encoder.py b/code/machine_learning/encode/binding_2D_matrix_encoder.py
@@ -1,37 +1,102 @@
-class miRBindEncoder():
+import pandas as pd
+import numpy as np
+import argparse
+import time
+
+
+def binding_encoding(df, alphabet, tensor_dim=(50, 20, 1)):
+    """
+    Transform input sequence pairs to a binding matrix with corresponding labels.
+
+    Parameters:
+    - df: Pandas DataFrame with columns "noncodingRNA", "gene", "label"
+    - alphabet: dictionary with letter tuples as keys and 1s when they bind
+    - tensor_dim: 2D binding matrix shape
+
+    Output:
+    2D binding matrix, labels as np array
+    """
+    labels = df["label"].to_numpy()
+
+    # Initialize dot matrix with zeros
+    ohe_matrix_2d = np.zeros((len(df), *tensor_dim), dtype="float32")
+
+    df = df.reset_index(drop=True)
+
+    # Compile matrix with Watson-Crick interactions
+    for index, row in df.iterrows():
+        for bind_index, bind_nt in enumerate(row['gene'].upper()):
+            for ncrna_index, ncrna_nt in enumerate(row['noncodingRNA'].upper()):
+                if ncrna_index >= tensor_dim[1]:
+                    break
+                base_pairs = bind_nt + ncrna_nt
+                ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0)
+
+    return ohe_matrix_2d, labels
+
+
+def encode_large_tsv_to_numpy(tsv_file_path, data_output_path, labels_output_path, chunk_size=10000):
+    """
+    Encode a large TSV file into a NumPy matrix using chunk processing.
+
+    Parameters:
+    - tsv_file_path: Path to the TSV file with dataset.
+    - data_output_path: Path to the output data .npy file.
+    - labels_output_path: Path to the output labels .npy file.
+    - chunk_size: Number of rows to process at a time.
     """
-    Based on Klimentová, Eva, et al. "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
-    Python implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind
+    # Alphabet for Watson-Crick interactions
+    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}
+    tensor_dim = (50, 20, 1)
+
+    # Get total number of rows in the dataset
+    num_rows = sum(len(df) for df in pd.read_csv(tsv_file_path, sep='\t', usecols=[0], chunksize=chunk_size))
+
+    # Determine the shape of the output arrays
+    labels_shape = (num_rows,)
+    data_shape = (num_rows, *tensor_dim)
+
+    # Create memory-mapped files
+    ohe_matrix_2d = np.memmap(data_output_path, dtype='float32', mode='w+', shape=data_shape)
+    labels = np.memmap(labels_output_path, dtype='float32', mode='w+', shape=labels_shape)
+
+    row_offset = 0
+
+    # Process each chunk
+    for chunk in pd.read_csv(tsv_file_path, sep='\t', chunksize=chunk_size):
+        encoded_data, encoded_labels = binding_encoding(chunk, alphabet, tensor_dim)
+
+        # Write the chunk's data and labels to the memory-mapped files
+        ohe_matrix_2d[row_offset:row_offset + len(chunk)] = encoded_data
+        labels[row_offset:row_offset + len(chunk)] = encoded_labels
+        row_offset += len(chunk)
+
+    # Flush changes to disk
+    ohe_matrix_2d.flush()
+    labels.flush()
+
+
+def main():
+    """
+    Based on "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
+    Original implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind
 
     Encodes miRNA and gene sequences into 2D-binding matrix.
-    2D-binding matrix has shape (gene_max_len, miRNA_max_len, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
-    Returns array with shape (num_of_samples, gene_max_len, miRNA_max_len, 1).
+    2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
     """
 
-    def __call__(self, df, miRNA_col="noncodingRNA", gene_col="gene", tensor_dim=(50, 20, 1)):
-        return self.binding_encoding(df, miRNA_col, gene_col, tensor_dim)
-
-    def binding_encoding(self, df, miRNA_col, gene_col, tensor_dim):
-        """
-        fun encodes miRNAs and mRNAs in df into binding matrices
-        :param df: dataframe containing gene_col and miRNA_col columns
-        :param tensor_dim: output shape of the matrix. If sequences are longer than tensor_dim, they will be truncated.
-        :return: 2D binding matrix with shape (N, *tensor_dim)
-        """
-
-        # alphabet for watson-crick interactions.
-        alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1., "AU": 1., "UA": 1.}
-        # create empty main 2d matrix array
-        N = df.shape[0]  # number of samples in df
-        shape_matrix_2d = (N, *tensor_dim)  # 2d matrix shape
-        # initialize dot matrix with zeros
-        ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32")
-
-        # compile matrix with watson-crick interactions.
-        for index, row in df.iterrows():
-            for bind_index, bind_nt in enumerate(row[gene_col][:tensor_dim[0]].upper()):
-                for mirna_index, mirna_nt in enumerate(row[miRNA_col][:tensor_dim[1]].upper()):
-                    base_pairs = bind_nt + mirna_nt
-                    ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0)
-
-        return ohe_matrix_2d
+    parser = argparse.ArgumentParser(
+        description="Encode dataset to miRNA x target binding matrix. Outputs numpy file with matrices and and numpy file with corresponding labels. Expected columns of the dataset are 'noncodingRNA', 'gene' and 'label'")
+    parser.add_argument('-i', '--i_file', type=str, required=True, help="Input dataset file name")
+    parser.add_argument('-o', '--o_prefix', type=str, required=True, help="Output file name prefix")
+    args = parser.parse_args()
+
+    start = time.time()
+    encode_large_tsv_to_numpy(args.i_file, args.o_prefix + '_dataset.npy', args.o_prefix + '_labels.npy')
+    end = time.time()
+
+    print("Elapsed time: ", end - start, " s.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py
@@ -0,0 +1,60 @@
+import tensorflow as tf
+from tensorflow import keras as K
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Input, Conv2D, LeakyReLU, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense
+
+
+class miRBind_CNN():
+    """
+    Build model architecture based on the CNN model presented in miRBind paper (2022) https://doi.org/10.3390/genes13122323
+    The default parameters are same as the ones used in the paper
+    """
+    def __init__(self, cnn_num = 6, kernel_size = 5, pool_size = 2, dropout_rate = 0.3, dense_num = 2):
+
+        x = Input(shape=(50,20,1), dtype='float32')
+        main_input = x
+
+        for cnn_i in range(cnn_num):
+            x = Conv2D(
+                filters=32 * (cnn_i + 1),
+                kernel_size=(kernel_size, kernel_size),
+                padding="same",
+                data_format="channels_last")(x)
+            x = LeakyReLU()(x)
+            x = BatchNormalization()(x)
+            x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same')(x)
+            x = Dropout(rate=dropout_rate)(x)
+
+        x = Flatten()(x)
+
+        for dense_i in range(dense_num):
+            neurons = 32 * (cnn_num - dense_i)
+            x = Dense(neurons)(x)
+            x = LeakyReLU()(x)
+            x = BatchNormalization()(x)
+            x = Dropout(rate=dropout_rate)(x)
+
+        main_output = Dense(1, activation='sigmoid')(x)
+
+        model = K.Model(inputs=[main_input], outputs=[main_output], name='miRBind_CNN')
+
+        self.model = model
+
+    def compile_model(self, lr=0.00152):
+        K.backend.clear_session()
+        model = self.model
+
+        opt = Adam(
+            learning_rate=lr,
+            beta_1=0.9,
+            beta_2=0.999,
+            epsilon=1e-07,
+            amsgrad=False,
+            name="Adam")
+
+        model.compile(
+            optimizer=opt,
+            loss='binary_crossentropy',
+            metrics=['accuracy']
+        )
+        return model
diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py
@@ -0,0 +1,139 @@
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+from tensorflow import keras as K
+import matplotlib.pyplot as plt
+from tensorflow.keras.utils import Sequence
+
+from miRBind_CNN_architecture import miRBind_CNN
+
+
+class DataGenerator(Sequence):
+    # preload the encoded numpy data
+    def __init__(self, data_path, labels_path, dataset_size, batch_size, validation_split=0.1,
+                 is_validation=False, shuffle=True):
+        # the dataset size is needed to properly load the numpy files
+        self.size = dataset_size
+
+        self.data = np.memmap(data_path, dtype='float32', mode='r', shape=(self.size, 50, 20, 1))
+        self.labels = np.memmap(labels_path, dtype='float32', mode='r', shape=(self.size,))
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+
+        # Determine number of train and validation samples
+        self.validation_split = validation_split
+        self.num_samples = len(self.data)
+        self.num_validation_samples = int(self.num_samples * validation_split)
+        self.num_train_samples = self.num_samples - self.num_validation_samples
+
+        # Determine indices for validation and training
+        indices = np.arange(self.num_samples)
+        if shuffle:
+            np.random.shuffle(indices)
+
+        if is_validation:
+            self.indices = indices[self.num_train_samples:]
+        else:
+            self.indices = indices[:self.num_train_samples]
+
+        # Shuffle the data initially
+        self.on_epoch_end()
+
+    def __len__(self):
+        # Denotes the number of batches per epoch
+        return int(np.ceil(len(self.indices) / float(self.batch_size)))
+
+    def __getitem__(self, idx):
+        # Generate one batch of data
+        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
+        batch_data = self.data[batch_indices]
+        batch_labels = self.labels[batch_indices]
+        return batch_data, batch_labels
+
+    def on_epoch_end(self):
+        # Updates indices after each epoch for shuffling
+        if self.shuffle:
+            np.random.shuffle(self.indices)
+
+
+def plot_history(history, ratio):
+    """
+    Plot history of the model training,
+    accuracy and loss of the training and validation set
+    """
+
+    acc = history.history['accuracy']
+    val_acc = history.history['val_accuracy']
+    loss = history.history['loss']
+    val_loss = history.history['val_loss']
+
+    epochs = range(1, len(acc) + 1)
+
+    plt.figure(figsize=(8, 6), dpi=80)
+
+    plt.plot(epochs, acc, 'bo', label='Training acc')
+    plt.plot(epochs, val_acc, 'b', label='Validation acc')
+    plt.title('Accuracy')
+    plt.legend()
+    plt.savefig(f"training_acc_1_{ratio}.jpg")
+
+    plt.figure()
+
+    plt.plot(epochs, loss, 'bo', label='Training loss')
+    plt.plot(epochs, val_loss, 'b', label='Validation loss')
+    plt.title('Loss')
+    plt.legend()
+    plt.savefig(f"training_loss_1_{ratio}.jpg")
+
+
+def train_model(data, labels, dataset_size, ratio, model_file, debug=False):
+    # set random state for reproducibility
+    np.random.seed(42)
+    tf.random.set_seed(42)
+    K.utils.set_random_seed(42)
+    # TODO still not fully reproducible? why?
+
+    train_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1,
+                                   is_validation=False)
+    val_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1,
+                                 is_validation=True)
+
+    model = miRBind_CNN().compile_model()
+    model_history = model.fit(
+        train_data_gen,
+        validation_data=val_data_gen,
+        epochs=10,
+        class_weight={0: 1, 1: ratio}
+    )
+
+    if debug:
+        plot_history(model_history, ratio)
+
+    model.save(model_file)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train CNN model on encoded miRNA x target binding matrix dataset")
+    parser.add_argument('--ratio', type=int, required=True, help="Ratio of pos:neg in the training dataset")
+    parser.add_argument('--data', type=str, required=True, help="File with the encoded dataset")
+    parser.add_argument('--labels', type=str, required=True, help="File with the dataset labels")
+    parser.add_argument('--dataset_size', type=int, required=True,
+                        help="Number of samples in the dataset. Needed to properly load the numpy files.")
+    parser.add_argument('--model', type=str, required=False, help="Filename to save the trained model")
+    parser.add_argument('--debug', type=bool, default=False, help="Set to True to output some plots about training")
+    args = parser.parse_args()
+
+    if args.model is None:
+        args.model = f"model_1_{args.ratio}.keras"
+
+    start = time.time()
+    train_model(data=args.data, labels=args.labels, dataset_size=args.dataset_size, ratio=args.ratio,
+                model_file=args.model, debug=args.debug)
+    end = time.time()
+
+    print("Elapsed time: ", end - start, " s.")
+
+
+if __name__ == "__main__":
+    main()