diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/README.md b/analysis/miRBind_CNN_retraining_orig_parameters/README.md
new file mode 100644
index 0000000..de651fb
--- /dev/null
+++ b/analysis/miRBind_CNN_retraining_orig_parameters/README.md
@@ -0,0 +1,15 @@
+# miRBind CNN retraining with original parameters
+
+Run 
+```bash run_retraining.sh```
+(from this directory) to retrain the miRBind CNN as presented in the [miRBind paper](https://doi.org/10.3390/genes13122323) on the Manakov 1:1 train dataset.
+The training is done with the original hyperparameters used in the paper.
+
+### Dependencies
+
+- python=3.8
+- tensorflow=2.13
+- matplotlib
+- numpy
+- pandas
+
diff --git a/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh b/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh
new file mode 100644
index 0000000..3afd191
--- /dev/null
+++ b/analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -euo pipefail  # stop at the first failing step so training never runs on a missing or partial encoding
+DATASET="../../data/chimeric_datasets/Manakov2022/AGO_eCLIP_Manakov2022_1_train_dataset.tsv"
+MODEL="../../models/miRBind_CNN_retrained_Manakov_1_orig_parameters.keras"
+CODE="../../code/machine_learning"
+cd "$(dirname "${BASH_SOURCE[0]}")"  # the relative paths above are resolved from this script's directory
+mkdir -p encoded_dataset "$(dirname "$MODEL")"
+
+# encode dataset (writes <o_prefix>_dataset.npy and <o_prefix>_labels.npy)
+python "$CODE/encode/binding_2D_matrix_encoder.py" --i_file "$DATASET" --o_prefix encoded_dataset/AGO2_eCLIP_Manakov2022_1_train
+
+# train model; --dataset_size must equal the number of data rows in $DATASET
+python "$CODE/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py" \
+--data encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_dataset.npy \
+--labels encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_labels.npy \
+--dataset_size 2524246 \
+--ratio 1 \
+--model "$MODEL"
\ No newline at end of file
diff --git a/code/machine_learning/encode/README.md b/code/machine_learning/encode/README.md
index 79fe152..697c03e 100644
--- a/code/machine_learning/encode/README.md
+++ b/code/machine_learning/encode/README.md
@@ -1 +1,17 @@
-# Encoding the dataset into inner representation
\ No newline at end of file
+# Encoding the dataset into inner representation
+
+### [Binding 2D matrix encoder](binding_2D_matrix_encoder.py)
+ The encoder is based on the "miRBind: A deep learning method for miRNA binding classification." (2022) https://doi.org/10.3390/genes13122323
+ with original python implementation here: https://github.com/ML-Bioinfo-CEITEC/miRBind
+
+Encodes miRNA and gene sequences into 2D-binding matrix.
+2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
+
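+Outputs two raw float32 memory-mapped files: `<o_prefix>_dataset.npy` with the encoded matrices and `<o_prefix>_labels.npy` with the corresponding labels. Despite the extension they carry no NumPy `.npy` header, so read them with `np.memmap` (passing dtype and shape) rather than `np.load`.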
+
+#### Usage
+Run the script from the command line with the following syntax:
+
+
+```python binding_2D_matrix_encoder.py --i_file input_dataset_file.tsv --o_prefix output_prefix```
+
diff --git a/code/machine_learning/encode/binding_2D_matrix_encoder.py b/code/machine_learning/encode/binding_2D_matrix_encoder.py
index 70b6360..484e534 100644
--- a/code/machine_learning/encode/binding_2D_matrix_encoder.py
+++ b/code/machine_learning/encode/binding_2D_matrix_encoder.py
@@ -1,37 +1,102 @@
-class miRBindEncoder():
+import pandas as pd
+import numpy as np
+import argparse
+import time
+
+
+def binding_encoding(df, alphabet, tensor_dim=(50, 20, 1)):
+    """
+    Transform input sequence pairs to a binding matrix with corresponding labels.
+
+    Parameters:
+    - df: Pandas DataFrame with columns "noncodingRNA", "gene", "label"
+    - alphabet: dict mapping a 2-letter string (gene nt + ncRNA nt) to the cell value; other pairs give 0
+    - tensor_dim: shape of one matrix (gene_len, ncRNA_len, 1); longer sequences are truncated to fit
+
+    Output:
+    float32 array of shape (len(df), *tensor_dim) with the binding matrices, labels as np array
+    """
+    labels = df["label"].to_numpy()
+
+    # Initialize dot matrix with zeros
+    ohe_matrix_2d = np.zeros((len(df), *tensor_dim), dtype="float32")
+    # Positional index: iterrows() must yield 0..len(df)-1 even when df is a later chunk of a file
+    df = df.reset_index(drop=True)
+
+    # Compile matrix with Watson-Crick interactions
+    for index, row in df.iterrows():
+        # Truncate BOTH sequences; an untruncated gene longer than tensor_dim[0] raised IndexError
+        gene = row['gene'][:tensor_dim[0]].upper()
+        ncrna = row['noncodingRNA'][:tensor_dim[1]].upper()
+        for bind_index, bind_nt in enumerate(gene):
+            for ncrna_index, ncrna_nt in enumerate(ncrna):
+                ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(bind_nt + ncrna_nt, 0)
+
+    return ohe_matrix_2d, labels
+
+
+def encode_large_tsv_to_numpy(tsv_file_path, data_output_path, labels_output_path, chunk_size=10000):
+    """
+    Encode a large TSV file chunk by chunk into two raw float32 memory-mapped files (no .npy header is written).
+
+    Parameters:
+    - tsv_file_path: Path to the TSV file with dataset (needs the columns read by binding_encoding).
+    - data_output_path: Path to the output data file; raw float32 array of shape (rows, 50, 20, 1).
+    - labels_output_path: Path to the output labels file; raw float32 array of shape (rows,).
+    - chunk_size: Number of rows to process at a time.
     """
-    Based on Klimentová, Eva, et al. "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
-    Python implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind
+    # Alphabet for Watson-Crick interactions (DNA letters only: pairs involving U are scored 0)
+    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}
+    tensor_dim = (50, 20, 1)
+
+    # First pass: count the data rows, reading only the first column, so the outputs can be sized up front
+    num_rows = sum(len(df) for df in pd.read_csv(tsv_file_path, sep='\t', usecols=[0], chunksize=chunk_size))
+
+    # Determine the shape of the output arrays
+    labels_shape = (num_rows,)
+    data_shape = (num_rows, *tensor_dim)
+
+    # Create memory-mapped output files; readers must supply the same dtype and shape to np.memmap
+    ohe_matrix_2d = np.memmap(data_output_path, dtype='float32', mode='w+', shape=data_shape)
+    labels = np.memmap(labels_output_path, dtype='float32', mode='w+', shape=labels_shape)
+
+    row_offset = 0
+
+    # Second pass: encode each chunk and copy it to its row range in the outputs
+    for chunk in pd.read_csv(tsv_file_path, sep='\t', chunksize=chunk_size):
+        encoded_data, encoded_labels = binding_encoding(chunk, alphabet, tensor_dim)
+
+        # Write the chunk's data and labels to the memory-mapped files
+        ohe_matrix_2d[row_offset:row_offset + len(chunk)] = encoded_data
+        labels[row_offset:row_offset + len(chunk)] = encoded_labels
+        row_offset += len(chunk)
+
+    # Flush changes to disk
+    ohe_matrix_2d.flush()
+    labels.flush()
+
+
+def main():
+    """
+    Based on "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
+    Original implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind
 
     Encodes miRNA and gene sequences into 2D-binding matrix.
-    2D-binding matrix has shape (gene_max_len, miRNA_max_len, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
-    Returns array with shape (num_of_samples, gene_max_len, miRNA_max_len, 1).
+    2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
     """
 
-    def __call__(self, df, miRNA_col="noncodingRNA", gene_col="gene", tensor_dim=(50, 20, 1)):
-        return self.binding_encoding(df, miRNA_col, gene_col, tensor_dim)
-    
-    def binding_encoding(self, df, miRNA_col, gene_col, tensor_dim):
-        """
-        fun encodes miRNAs and mRNAs in df into binding matrices
-        :param df: dataframe containing gene_col and miRNA_col columns
-        :param tensor_dim: output shape of the matrix. If sequences are longer than tensor_dim, they will be truncated.
-        :return: 2D binding matrix with shape (N, *tensor_dim)
-        """
-
-        # alphabet for watson-crick interactions.
-        alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1., "AU": 1., "UA": 1.}
-        # create empty main 2d matrix array
-        N = df.shape[0]  # number of samples in df
-        shape_matrix_2d = (N, *tensor_dim)  # 2d matrix shape
-        # initialize dot matrix with zeros
-        ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32")
-
-        # compile matrix with watson-crick interactions.
-        for index, row in df.iterrows():
-            for bind_index, bind_nt in enumerate(row[gene_col][:tensor_dim[0]].upper()):
-                for mirna_index, mirna_nt in enumerate(row[miRNA_col][:tensor_dim[1]].upper()):
-                    base_pairs = bind_nt + mirna_nt
-                    ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0)
-
-        return ohe_matrix_2d
\ No newline at end of file
+    parser = argparse.ArgumentParser(
+        description="Encode dataset to miRNA x target binding matrix. Outputs numpy file with matrices and numpy file with corresponding labels. Expected columns of the dataset are 'noncodingRNA', 'gene' and 'label'")
+    parser.add_argument('-i', '--i_file', type=str, required=True, help="Input dataset file name")
+    parser.add_argument('-o', '--o_prefix', type=str, required=True, help="Output file name prefix")
+    args = parser.parse_args()
+
+    start = time.time()
+    encode_large_tsv_to_numpy(args.i_file, args.o_prefix + '_dataset.npy', args.o_prefix + '_labels.npy')
+    end = time.time()
+
+    print("Elapsed time: ", end - start, " s.")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py
new file mode 100644
index 0000000..48d8bef
--- /dev/null
+++ b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_architecture.py
@@ -0,0 +1,60 @@
+import tensorflow as tf
+from tensorflow import keras as K
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Input, Conv2D, LeakyReLU, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense
+
+
+class miRBind_CNN():
+    """
+    Build model architecture based on the CNN model presented in miRBind paper (2022) https://doi.org/10.3390/genes13122323
+    The default parameters are same as the ones used in the paper. The built Keras model takes (50, 20, 1) float32 inputs, ends in one sigmoid unit and is stored in self.model.
+    """
+    def __init__(self, cnn_num = 6, kernel_size = 5, pool_size = 2, dropout_rate = 0.3, dense_num = 2):
+
+        x = Input(shape=(50,20,1), dtype='float32')
+        main_input = x
+
+        for cnn_i in range(cnn_num):
+            x = Conv2D(
+                filters=32 * (cnn_i + 1),  # filter count grows with depth: 32, 64, ..., 32 * cnn_num
+                kernel_size=(kernel_size, kernel_size),
+                padding="same",
+                data_format="channels_last")(x)
+            x = LeakyReLU()(x)
+            x = BatchNormalization()(x)
+            x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same')(x)
+            x = Dropout(rate=dropout_rate)(x)
+
+        x = Flatten()(x)
+
+        for dense_i in range(dense_num):
+            neurons = 32 * (cnn_num - dense_i)  # dense widths shrink by 32 per layer, starting at 32 * cnn_num
+            x = Dense(neurons)(x)
+            x = LeakyReLU()(x)
+            x = BatchNormalization()(x)
+            x = Dropout(rate=dropout_rate)(x)
+
+        main_output = Dense(1, activation='sigmoid')(x)
+
+        model = K.Model(inputs=[main_input], outputs=[main_output], name='miRBind_CNN')
+
+        self.model = model
+
+    def compile_model(self, lr=0.00152):  # compiles self.model (Adam, binary cross-entropy, accuracy) and returns it
+        K.backend.clear_session()  # NOTE(review): runs after the layers were created in __init__ - confirm this is intended
+        model = self.model
+
+        opt = Adam(
+            learning_rate=lr,
+            beta_1=0.9,
+            beta_2=0.999,
+            epsilon=1e-07,
+            amsgrad=False,
+            name="Adam")
+
+        model.compile(
+            optimizer=opt,
+            loss='binary_crossentropy',
+            metrics=['accuracy']
+        )
+        return model
\ No newline at end of file
diff --git a/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py
new file mode 100644
index 0000000..6d8e1cc
--- /dev/null
+++ b/code/machine_learning/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py
@@ -0,0 +1,139 @@
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+from tensorflow import keras as K
+import matplotlib.pyplot as plt
+from tensorflow.keras.utils import Sequence
+
+from miRBind_CNN_architecture import miRBind_CNN
+
+
+class DataGenerator(Sequence):
+    """Keras Sequence serving batches from raw float32 memmap files, restricted to the train or the validation part."""
+    def __init__(self, data_path, labels_path, dataset_size, batch_size, validation_split=0.1,
+                 is_validation=False, shuffle=True, split_seed=42):
+        # the dataset size is needed because the raw memmap files do not store their own shape
+        self.size = dataset_size
+
+        self.data = np.memmap(data_path, dtype='float32', mode='r', shape=(self.size, 50, 20, 1))
+        self.labels = np.memmap(labels_path, dtype='float32', mode='r', shape=(self.size,))
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+
+        # Determine number of train and validation samples
+        self.validation_split = validation_split
+        self.num_samples = len(self.data)
+        self.num_validation_samples = int(self.num_samples * validation_split)
+        self.num_train_samples = self.num_samples - self.num_validation_samples
+
+        # Permute with a private RNG seeded by split_seed: train and validation instances must share ONE permutation
+        indices = np.arange(self.num_samples)
+        if shuffle:
+            np.random.default_rng(split_seed).shuffle(indices)  # global RNG here made the two index sets overlap
+
+        if is_validation:
+            self.indices = indices[self.num_train_samples:]
+        else:
+            self.indices = indices[:self.num_train_samples]
+
+        # Shuffle the data initially
+        self.on_epoch_end()
+
+    def __len__(self):
+        # Denotes the number of batches per epoch (the last batch may be smaller than batch_size)
+        return int(np.ceil(len(self.indices) / float(self.batch_size)))
+
+    def __getitem__(self, idx):
+        # Generate one batch of data: rows are gathered from the memmaps by this instance's index subset
+        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
+        batch_data = self.data[batch_indices]
+        batch_labels = self.labels[batch_indices]
+        return batch_data, batch_labels
+
+    def on_epoch_end(self):
+        # Reorders this instance's own indices in place (global NumPy RNG); membership of the subset never changes
+        if self.shuffle:
+            np.random.shuffle(self.indices)
+
+
+def plot_history(history, ratio):
+    """
+    Save the training curves of a fitted model as two JPG files in the working directory:
+    training_acc_1_{ratio}.jpg (accuracy) and training_loss_1_{ratio}.jpg (loss).
+
+    history is the object returned by model.fit(); ratio is only used in the file names.
+    """
+    # Pull all four series first so that a missing key fails before anything is drawn
+    recorded = history.history
+    train_acc, valid_acc = recorded['accuracy'], recorded['val_accuracy']
+    train_loss, valid_loss = recorded['loss'], recorded['val_loss']
+    x_axis = range(1, len(train_acc) + 1)
+
+    # Accuracy curves on an explicitly sized figure
+    plt.figure(figsize=(8, 6), dpi=80)
+    plt.plot(x_axis, train_acc, 'bo', label='Training acc')
+    plt.plot(x_axis, valid_acc, 'b', label='Validation acc')
+    plt.title('Accuracy')
+    plt.legend()
+    plt.savefig(f"training_acc_1_{ratio}.jpg")
+
+    # Loss curves on a fresh default-sized figure
+    plt.figure()
+    plt.plot(x_axis, train_loss, 'bo', label='Training loss')
+    plt.plot(x_axis, valid_loss, 'b', label='Validation loss')
+    plt.title('Loss')
+    plt.legend()
+    plt.savefig(f"training_loss_1_{ratio}.jpg")
+
+
+def train_model(data, labels, dataset_size, ratio, model_file, debug=False):
+    """Train the miRBind CNN for 10 epochs using DataGenerator with validation_split=0.1 and save it to model_file."""
+    # Seed every RNG in play; TODO still not fully reproducible? why?
+    for seed_rng in (np.random.seed, tf.random.set_seed, K.utils.set_random_seed):
+        seed_rng(42)
+
+    # Both generators read the same files; only the is_validation flag differs (train is built first)
+    generator_options = {'batch_size': 32, 'validation_split': 0.1}
+    training_batches = DataGenerator(data, labels, dataset_size, is_validation=False, **generator_options)
+    validation_batches = DataGenerator(data, labels, dataset_size, is_validation=True, **generator_options)
+
+    network = miRBind_CNN().compile_model()
+    # class_weight gives label-1 samples the weight `ratio` and label-0 samples the weight 1
+    fit_arguments = {
+        'validation_data': validation_batches,
+        'epochs': 10,
+        'class_weight': {0: 1, 1: ratio},
+    }
+    fit_log = network.fit(training_batches, **fit_arguments)
+
+    if debug:
+        plot_history(fit_log, ratio)
+
+    network.save(model_file)
+
+
+def main():
+    """Parse the command-line arguments, train the model and print the elapsed wall-clock time."""
+    parser = argparse.ArgumentParser(description="Train CNN model on encoded miRNA x target binding matrix dataset")
+    parser.add_argument('--ratio', type=int, required=True, help="Ratio of pos:neg in the training dataset")
+    parser.add_argument('--data', type=str, required=True, help="File with the encoded dataset")
+    parser.add_argument('--labels', type=str, required=True, help="File with the dataset labels")
+    parser.add_argument('--dataset_size', type=int, required=True, help="Number of samples in the dataset. Needed to properly load the numpy files.")
+    parser.add_argument('--model', type=str, required=False, help="Filename to save the trained model")
+    # type=bool would turn ANY non-empty string (even "False") into True, so the text is parsed explicitly
+    parser.add_argument('--debug', type=lambda text: text.strip().lower() in ('true', '1', 'yes', 'y'), default=False, help="Set to True to output some plots about training")
+    args = parser.parse_args()
+
+    if args.model is None:
+        args.model = f"model_1_{args.ratio}.keras"
+
+    start = time.time()
+    train_model(data=args.data, labels=args.labels, dataset_size=args.dataset_size, ratio=args.ratio, model_file=args.model, debug=args.debug)
+    end = time.time()
+
+    print("Elapsed time: ", end - start, " s.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code/machine_learning/train/CNN_miRBind_2022/training.ipynb b/code/machine_learning/train/CNN_miRBind_2022/training.ipynb
deleted file mode 100644
index cf24ebf..0000000
--- a/code/machine_learning/train/CNN_miRBind_2022/training.ipynb
+++ /dev/null
@@ -1,442 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "Training.ipynb",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
-  "cells": [
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "T6BIHgU38o2f"
-      },
-      "source": [
-        "import pandas as pd\n",
-        "import numpy as np\n",
-        "import tensorflow as tf\n",
-        "from tensorflow import keras as K\n",
-        "import matplotlib.pyplot as plt\n",
-        "from matplotlib.pyplot import figure\n",
-        "from tensorflow.keras.layers import (\n",
-        "                                BatchNormalization, LeakyReLU,\n",
-        "                                Input, Dense, Conv2D,\n",
-        "                                MaxPooling2D, Flatten, Dropout)\n",
-        "from tensorflow.keras.optimizers import Adam"
-      ],
-      "execution_count": 1,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "EfIQCQj2r3fV",
-        "outputId": "26867200-18e8-4b4e-d34d-129c7203f694"
-      },
-      "source": [
-        "!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv"
-      ],
-      "execution_count": 2,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "--2022-04-24 17:22:50--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv\n",
-            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
-            "HTTP request sent, awaiting response... 200 OK\n",
-            "Length: 12518906 (12M) [text/plain]\n",
-            "Saving to: ‘train_set_1_10_CLASH2013_paper.tsv’\n",
-            "\n",
-            "train_set_1_10_CLAS 100%[===================>]  11.94M  --.-KB/s    in 0.09s   \n",
-            "\n",
-            "2022-04-24 17:22:51 (130 MB/s) - ‘train_set_1_10_CLASH2013_paper.tsv’ saved [12518906/12518906]\n",
-            "\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "ALmjW7vd9u60"
-      },
-      "source": [
-        "def binding_encoding(df, tensor_dim=(50,20,1)):\n",
-        "    \"\"\"\n",
-        "    fun transform input database to numpy array.\n",
-        "    \n",
-        "    parameters:\n",
-        "    df = Pandas df with col names \"noncodingRNA\", \"gene\", \"label\"\n",
-        "    tensor_dim = 2d matrix shape\n",
-        "    \n",
-        "    output:\n",
-        "    2d dot matrix, labels as np array\n",
-        "    \"\"\"\n",
-        "    df.reset_index(inplace=True, drop=True)\n",
-        "\n",
-        "    # alphabet for watson-crick interactions.\n",
-        "    alphabet = {\"AT\": 1., \"TA\": 1., \"GC\": 1., \"CG\": 1.} \n",
-        "\n",
-        "    # labels to one hot encoding\n",
-        "    labels = df[\"label\"].to_numpy()\n",
-        "\n",
-        "    # create empty main 2d matrix array\n",
-        "    N = df.shape[0] # number of samples in df\n",
-        "    shape_matrix_2d = (N, *tensor_dim) # 2d matrix shape \n",
-        "    # initialize dot matrix with zeros\n",
-        "    ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype=\"float32\")\n",
-        "\n",
-        "    # compile matrix with watson-crick interactions.\n",
-        "    for index, row in df.iterrows():        \n",
-        "        for bind_index, bind_nt in enumerate(row.gene.upper()):\n",
-        "        \n",
-        "            for ncrna_index, ncrna_nt in enumerate(row.noncodingRNA.upper()):\n",
-        "                if ncrna_index >= tensor_dim[1]:\n",
-        "                    break\n",
-        "                base_pairs = bind_nt + ncrna_nt\n",
-        "                ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0)\n",
-        "    \n",
-        "\n",
-        "    return ohe_matrix_2d, labels"
-      ],
-      "execution_count": 3,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "CpoytQwIElkg"
-      },
-      "source": [
-        "def make_architecture():\n",
-        "    \"\"\"\n",
-        "    build model architecture\n",
-        "\n",
-        "    return a model object\n",
-        "    \"\"\"\n",
-        "    cnn_num = 6\n",
-        "    kernel_size = 5\n",
-        "    pool_size = 2\n",
-        "    dropout_rate = 0.3\n",
-        "    dense_num = 2\n",
-        "\n",
-        "    x = Input(shape=(50,20,1),\n",
-        "                       dtype='float32', name='main_input'\n",
-        "                       )\n",
-        "    main_input = x\n",
-        "\n",
-        "    for cnn_i in range(cnn_num):\n",
-        "        x = Conv2D(\n",
-        "            filters=32 * (cnn_i + 1),\n",
-        "            kernel_size=(kernel_size, kernel_size),\n",
-        "            padding=\"same\",\n",
-        "            data_format=\"channels_last\",\n",
-        "            name=\"conv_\" + str(cnn_i + 1))(x)\n",
-        "        x = LeakyReLU()(x)\n",
-        "        x = BatchNormalization()(x)\n",
-        "        x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same', name='Max_' + str(cnn_i + 1))(x)\n",
-        "        x = Dropout(rate=dropout_rate)(x)\n",
-        "\n",
-        "    x = Flatten(name='2d_matrix')(x)\n",
-        "\n",
-        "    for dense_i in range(dense_num):\n",
-        "        neurons = 32 * (cnn_num - dense_i)\n",
-        "        x = Dense(neurons)(x)\n",
-        "        x = LeakyReLU()(x)\n",
-        "        x = BatchNormalization()(x)\n",
-        "        x = Dropout(rate=dropout_rate)(x)\n",
-        "\n",
-        "    main_output = Dense(1, activation='sigmoid', name='main_output')(x)\n",
-        "\n",
-        "    model = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')\n",
-        "    \n",
-        "    return model"
-      ],
-      "execution_count": 4,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "oZ591qC0Femi"
-      },
-      "source": [
-        "def compile_model():\n",
-        "    K.backend.clear_session()\n",
-        "    model = make_architecture()\n",
-        "    \n",
-        "    opt = Adam(\n",
-        "        learning_rate=0.00152,\n",
-        "        beta_1=0.9,\n",
-        "        beta_2=0.999,\n",
-        "        epsilon=1e-07,\n",
-        "        amsgrad=False,\n",
-        "        name=\"Adam\")\n",
-        "\n",
-        "    model.compile(\n",
-        "        optimizer=opt,\n",
-        "        loss='binary_crossentropy',\n",
-        "        metrics=['accuracy']\n",
-        "        )\n",
-        "    return model"
-      ],
-      "execution_count": 5,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "wTB6K0lxyzcx"
-      },
-      "source": [
-        "def plot_history(history):\n",
-        "    \"\"\"\n",
-        "    plot history of the training of the model,\n",
-        "    accuracy and loss of the training and validation set\n",
-        "    \"\"\"\n",
-        "    \n",
-        "    acc = history.history['accuracy']\n",
-        "    val_acc = history.history['val_accuracy']\n",
-        "    loss = history.history['loss']\n",
-        "    val_loss = history.history['val_loss']\n",
-        "\n",
-        "    epochs = range(1, len(acc) + 1)\n",
-        "\n",
-        "    plt.figure(figsize=(8, 6), dpi=80)\n",
-        "\n",
-        "    plt.plot(epochs, acc, 'bo', label='Training acc')\n",
-        "    plt.plot(epochs, val_acc, 'b', label='Validation acc')\n",
-        "    plt.title('Accuracy')\n",
-        "    plt.legend()\n",
-        "    plt.figure()\n",
-        "\n",
-        "    plt.plot(epochs, loss, 'bo', label='Training loss')\n",
-        "    plt.plot(epochs, val_loss, 'b', label='Validation loss')\n",
-        "    plt.title('Loss')\n",
-        "    plt.legend()\n",
-        "    plt.show()"
-      ],
-      "execution_count": 6,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "3Bh2-XEPxmZf",
-        "outputId": "3dffc4d8-d135-46fd-dcf4-94edb75db0cb"
-      },
-      "source": [
-        "train_df = pd.read_csv('train_set_1_10_CLASH2013_paper.tsv', sep='\\t', names=['noncodingRNA', 'gene', 'label'], header=0)\n",
-        "# set random state for reproducibility\n",
-        "RANDOM_STATE = 42\n",
-        "np.random.seed(RANDOM_STATE)\n",
-        "train_df = train_df.sample(frac=1, random_state=RANDOM_STATE)\n",
-        "print(train_df.head())\n",
-        "ohe_data = binding_encoding(train_df)\n",
-        "train_ohe, labels = ohe_data\n",
-        "print(\"Number of training samples: \", train_df.shape[0])"
-      ],
-      "execution_count": 8,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "                noncodingRNA  \\\n",
-            "45236   ACTGCATTATGAGCACTTAA   \n",
-            "168824  TATTGCACTTGTCCCGGCCT   \n",
-            "2591    AAAAGCTGGGTTGAGAGGGC   \n",
-            "76746   TCTCACACAGAAATCGCACC   \n",
-            "63277   TGAGGTAGTAGTTTGTGCTG   \n",
-            "\n",
-            "                                                     gene  label  \n",
-            "45236   GAGAAGAAATCTGGCTGGTTTGAGGGTTTCCTTTAGTTCACCCTCA...      0  \n",
-            "168824  GTAAATGTCTGTTTTTCATAATTGCTCTTTATATTGTGTGTTATCT...      0  \n",
-            "2591    GTACCCAGTAAAAACCAGAATGACCCATTGCCAGGACGCATCAAAG...      1  \n",
-            "76746   ACGTCGGCGCCATGCTCCAGGTACAGAGCCACATGTTGCTCCAGGC...      0  \n",
-            "63277   ACCAATGCCAGAGGAGCAACAGCGGCAACCTTTGGCACTGCATCCA...      0  \n",
-            "Number of training samples:  169312\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "Lcc6OuabyVsK",
-        "outputId": "bb4e3fae-f08d-4d37-acfe-ff41c66e83eb"
-      },
-      "source": [
-        "model = compile_model()\n",
-        "model.summary()"
-      ],
-      "execution_count": 9,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Model: \"arch_00\"\n",
-            "_________________________________________________________________\n",
-            " Layer (type)                Output Shape              Param #   \n",
-            "=================================================================\n",
-            " main_input (InputLayer)     [(None, 50, 20, 1)]       0         \n",
-            "                                                                 \n",
-            " conv_1 (Conv2D)             (None, 50, 20, 32)        832       \n",
-            "                                                                 \n",
-            " leaky_re_lu (LeakyReLU)     (None, 50, 20, 32)        0         \n",
-            "                                                                 \n",
-            " batch_normalization (BatchN  (None, 50, 20, 32)       128       \n",
-            " ormalization)                                                   \n",
-            "                                                                 \n",
-            " Max_1 (MaxPooling2D)        (None, 25, 10, 32)        0         \n",
-            "                                                                 \n",
-            " dropout (Dropout)           (None, 25, 10, 32)        0         \n",
-            "                                                                 \n",
-            " conv_2 (Conv2D)             (None, 25, 10, 64)        51264     \n",
-            "                                                                 \n",
-            " leaky_re_lu_1 (LeakyReLU)   (None, 25, 10, 64)        0         \n",
-            "                                                                 \n",
-            " batch_normalization_1 (Batc  (None, 25, 10, 64)       256       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " Max_2 (MaxPooling2D)        (None, 13, 5, 64)         0         \n",
-            "                                                                 \n",
-            " dropout_1 (Dropout)         (None, 13, 5, 64)         0         \n",
-            "                                                                 \n",
-            " conv_3 (Conv2D)             (None, 13, 5, 96)         153696    \n",
-            "                                                                 \n",
-            " leaky_re_lu_2 (LeakyReLU)   (None, 13, 5, 96)         0         \n",
-            "                                                                 \n",
-            " batch_normalization_2 (Batc  (None, 13, 5, 96)        384       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " Max_3 (MaxPooling2D)        (None, 7, 3, 96)          0         \n",
-            "                                                                 \n",
-            " dropout_2 (Dropout)         (None, 7, 3, 96)          0         \n",
-            "                                                                 \n",
-            " conv_4 (Conv2D)             (None, 7, 3, 128)         307328    \n",
-            "                                                                 \n",
-            " leaky_re_lu_3 (LeakyReLU)   (None, 7, 3, 128)         0         \n",
-            "                                                                 \n",
-            " batch_normalization_3 (Batc  (None, 7, 3, 128)        512       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " Max_4 (MaxPooling2D)        (None, 4, 2, 128)         0         \n",
-            "                                                                 \n",
-            " dropout_3 (Dropout)         (None, 4, 2, 128)         0         \n",
-            "                                                                 \n",
-            " conv_5 (Conv2D)             (None, 4, 2, 160)         512160    \n",
-            "                                                                 \n",
-            " leaky_re_lu_4 (LeakyReLU)   (None, 4, 2, 160)         0         \n",
-            "                                                                 \n",
-            " batch_normalization_4 (Batc  (None, 4, 2, 160)        640       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " Max_5 (MaxPooling2D)        (None, 2, 1, 160)         0         \n",
-            "                                                                 \n",
-            " dropout_4 (Dropout)         (None, 2, 1, 160)         0         \n",
-            "                                                                 \n",
-            " conv_6 (Conv2D)             (None, 2, 1, 192)         768192    \n",
-            "                                                                 \n",
-            " leaky_re_lu_5 (LeakyReLU)   (None, 2, 1, 192)         0         \n",
-            "                                                                 \n",
-            " batch_normalization_5 (Batc  (None, 2, 1, 192)        768       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " Max_6 (MaxPooling2D)        (None, 1, 1, 192)         0         \n",
-            "                                                                 \n",
-            " dropout_5 (Dropout)         (None, 1, 1, 192)         0         \n",
-            "                                                                 \n",
-            " 2d_matrix (Flatten)         (None, 192)               0         \n",
-            "                                                                 \n",
-            " dense (Dense)               (None, 192)               37056     \n",
-            "                                                                 \n",
-            " leaky_re_lu_6 (LeakyReLU)   (None, 192)               0         \n",
-            "                                                                 \n",
-            " batch_normalization_6 (Batc  (None, 192)              768       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " dropout_6 (Dropout)         (None, 192)               0         \n",
-            "                                                                 \n",
-            " dense_1 (Dense)             (None, 160)               30880     \n",
-            "                                                                 \n",
-            " leaky_re_lu_7 (LeakyReLU)   (None, 160)               0         \n",
-            "                                                                 \n",
-            " batch_normalization_7 (Batc  (None, 160)              640       \n",
-            " hNormalization)                                                 \n",
-            "                                                                 \n",
-            " dropout_7 (Dropout)         (None, 160)               0         \n",
-            "                                                                 \n",
-            " main_output (Dense)         (None, 1)                 161       \n",
-            "                                                                 \n",
-            "=================================================================\n",
-            "Total params: 1,865,665\n",
-            "Trainable params: 1,863,617\n",
-            "Non-trainable params: 2,048\n",
-            "_________________________________________________________________\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "vIEbdJxqydNm"
-      },
-      "source": [
-        "model_history = model.fit(\n",
-        "    train_ohe, labels,\n",
-        "    validation_split=0.05, epochs=10,\n",
-        "    batch_size=32,\n",
-        "    class_weight={0 : 1, 1 : 10}\n",
-        "    )"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "e4uUTu-k0Y2S"
-      },
-      "source": [
-        "plot_history(model_history)"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "WDSXwnZmymfK"
-      },
-      "source": [
-        "model.save(\"model.h5\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/code/machine_learning/train/README.md b/code/machine_learning/train/README.md
index 2eb5163..ef527dd 100644
--- a/code/machine_learning/train/README.md
+++ b/code/machine_learning/train/README.md
@@ -1 +1,8 @@
-# Training the models
\ No newline at end of file
+# Training the models
+
+### CNN miRBind 2022
+This directory aggregates models based on the miRBind CNN architecture, which was presented in the [miRBind paper (2022)](https://doi.org/10.3390/genes13122323).
+
+[miRBind CNN architecture](CNN_miRBind_2022/miRBind_CNN_architecture.py) - contains the definition of the CNN model architecture
+
+[miRBind CNN training with original parameters](CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py) - trains the CNN model with the original hyperparameters described in the paper
\ No newline at end of file