Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions analysis/miRBind_CNN_retraining_orig_parameters/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# miRBind CNN retraining with original parameters

Run
```bash run_retraining.sh```
to retrain the miRBind CNN as presented in the [miRBind paper](https://doi.org/10.3390/genes13122323) on Manakov 1:1 train dataset.
The training is done with the original hyperparameters used in the paper.

### Dependencies

- python=3.8
- tensorflow=2.13
- matplotlib
- numpy
- pandas

18 changes: 18 additions & 0 deletions analysis/miRBind_CNN_retraining_orig_parameters/run_retraining.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash

DATASET="../../data/chimeric_datasets/Manakov2022/AGO_eCLIP_Manakov2022_1_train_dataset.tsv"
MODEL="../../models/miRBind_CNN_retrained_Manakov_1_orig_parameters.keras"
CODE="../../code/machine_learning"

mkdir -p encoded_dataset

# encode dataset
python $CODE/encode/binding_2D_matrix_encoder.py --i_file $DATASET --o_prefix encoded_dataset/AGO2_eCLIP_Manakov2022_1_train

# train model
python $CODE/train/CNN_miRBind_2022/miRBind_CNN_training_orig_parameters.py \
--data encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_dataset.npy \
--labels encoded_dataset/AGO2_eCLIP_Manakov2022_1_train_labels.npy \
--dataset_size 2524246 \
--ratio 1 \
--model $MODEL
18 changes: 17 additions & 1 deletion code/machine_learning/encode/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
# Encoding the dataset into inner representation
# Encoding the dataset into inner representation

### [Binding 2D matrix encoder](binding_2d_matrix_encoder.py)
The encoder is based on the "miRBind: A deep learning method for miRNA binding classification." (2022) https://doi.org/10.3390/genes13122323
with original python implementation here: https://github.com/ML-Bioinfo-CEITEC/miRBind

Encodes miRNA and gene sequences into 2D-binding matrix.
2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.

Outputs npy file with encoded matrices and npy file with corresponding labels.

#### Usage
Run the script from the command line with the following syntax:


```python binding_2d_matrix_encoder.py --i_file input_dataset_file.tsv --o_prefix output_prefix```

129 changes: 97 additions & 32 deletions code/machine_learning/encode/binding_2D_matrix_encoder.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,102 @@
class miRBindEncoder():
import pandas as pd
import numpy as np
import argparse
import time


def binding_encoding(df, alphabet, tensor_dim=(50, 20, 1)):
"""
Transform input sequence pairs to a binding matrix with corresponding labels.

Parameters:
- df: Pandas DataFrame with columns "noncodingRNA", "gene", "label"
- alphabet: dictionary with letter tuples as keys and 1s when they bind
- tensor_dim: 2D binding matrix shape

Output:
2D binding matrix, labels as np array
"""
labels = df["label"].to_numpy()

# Initialize dot matrix with zeros
ohe_matrix_2d = np.zeros((len(df), *tensor_dim), dtype="float32")

df = df.reset_index(drop=True)

# Compile matrix with Watson-Crick interactions
for index, row in df.iterrows():
for bind_index, bind_nt in enumerate(row['gene'].upper()):
for ncrna_index, ncrna_nt in enumerate(row['noncodingRNA'].upper()):
if ncrna_index >= tensor_dim[1]:
break
base_pairs = bind_nt + ncrna_nt
ohe_matrix_2d[index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0)

return ohe_matrix_2d, labels


def encode_large_tsv_to_numpy(tsv_file_path, data_output_path, labels_output_path, chunk_size=10000):
"""
Encode a large TSV file into a NumPy matrix using chunk processing.

Parameters:
- tsv_file_path: Path to the TSV file with dataset.
- data_output_path: Path to the output data .npy file.
- labels_output_path: Path to the output labels .npy file.
- chunk_size: Number of rows to process at a time.
"""
Based on Klimentová, Eva, et al. "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
Python implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind
# Alphabet for Watson-Crick interactions
alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}
tensor_dim = (50, 20, 1)

# Get total number of rows in the dataset
num_rows = sum(len(df) for df in pd.read_csv(tsv_file_path, sep='\t', usecols=[0], chunksize=chunk_size))

# Determine the shape of the output arrays
labels_shape = (num_rows,)
data_shape = (num_rows, *tensor_dim)

# Create memory-mapped files
ohe_matrix_2d = np.memmap(data_output_path, dtype='float32', mode='w+', shape=data_shape)
labels = np.memmap(labels_output_path, dtype='float32', mode='w+', shape=labels_shape)

row_offset = 0

# Process each chunk
for chunk in pd.read_csv(tsv_file_path, sep='\t', chunksize=chunk_size):
encoded_data, encoded_labels = binding_encoding(chunk, alphabet, tensor_dim)

# Write the chunk's data and labels to the memory-mapped files
ohe_matrix_2d[row_offset:row_offset + len(chunk)] = encoded_data
labels[row_offset:row_offset + len(chunk)] = encoded_labels
row_offset += len(chunk)

# Flush changes to disk
ohe_matrix_2d.flush()
labels.flush()


def main():
"""
Based on "miRBind: A deep learning method for miRNA binding classification." Genes 13.12 (2022): 2323. https://doi.org/10.3390/genes13122323.
Original implementation: https://github.com/ML-Bioinfo-CEITEC/miRBind

Encodes miRNA and gene sequences into 2D-binding matrix.
2D-binding matrix has shape (gene_max_len, miRNA_max_len, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
Returns array with shape (num_of_samples, gene_max_len, miRNA_max_len, 1).
2D-binding matrix has shape (gene_max_len=50, miRNA_max_len=20, 1) and contains 1 for Watson-Crick interactions and 0 otherwise.
"""

def __call__(self, df, miRNA_col="noncodingRNA", gene_col="gene", tensor_dim=(50, 20, 1)):
return self.binding_encoding(df, miRNA_col, gene_col, tensor_dim)

def binding_encoding(self, df, miRNA_col, gene_col, tensor_dim):
"""
fun encodes miRNAs and mRNAs in df into binding matrices
:param df: dataframe containing gene_col and miRNA_col columns
:param tensor_dim: output shape of the matrix. If sequences are longer than tensor_dim, they will be truncated.
:return: 2D binding matrix with shape (N, *tensor_dim)
"""

# alphabet for watson-crick interactions.
alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1., "AU": 1., "UA": 1.}
# create empty main 2d matrix array
N = df.shape[0] # number of samples in df
shape_matrix_2d = (N, *tensor_dim) # 2d matrix shape
# initialize dot matrix with zeros
ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32")

# compile matrix with watson-crick interactions.
for index, row in df.iterrows():
for bind_index, bind_nt in enumerate(row[gene_col][:tensor_dim[0]].upper()):
for mirna_index, mirna_nt in enumerate(row[miRNA_col][:tensor_dim[1]].upper()):
base_pairs = bind_nt + mirna_nt
ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0)

return ohe_matrix_2d
parser = argparse.ArgumentParser(
description="Encode dataset to miRNA x target binding matrix. Outputs numpy file with matrices and and numpy file with corresponding labels. Expected columns of the dataset are 'noncodingRNA', 'gene' and 'label'")
parser.add_argument('-i', '--i_file', type=str, required=True, help="Input dataset file name")
parser.add_argument('-o', '--o_prefix', type=str, required=True, help="Output file name prefix")
args = parser.parse_args()

start = time.time()
encode_large_tsv_to_numpy(args.i_file, args.o_prefix + '_dataset.npy', args.o_prefix + '_labels.npy')
end = time.time()

print("Elapsed time: ", end - start, " s.")


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Conv2D, LeakyReLU, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense


class miRBind_CNN():
"""
Build model architecture based on the CNN model presented in miRBind paper (2022) https://doi.org/10.3390/genes13122323
The default parameters are same as the ones used in the paper
"""
def __init__(self, cnn_num = 6, kernel_size = 5, pool_size = 2, dropout_rate = 0.3, dense_num = 2):

x = Input(shape=(50,20,1), dtype='float32')
main_input = x

for cnn_i in range(cnn_num):
x = Conv2D(
filters=32 * (cnn_i + 1),
kernel_size=(kernel_size, kernel_size),
padding="same",
data_format="channels_last")(x)
x = LeakyReLU()(x)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same')(x)
x = Dropout(rate=dropout_rate)(x)

x = Flatten()(x)

for dense_i in range(dense_num):
neurons = 32 * (cnn_num - dense_i)
x = Dense(neurons)(x)
x = LeakyReLU()(x)
x = BatchNormalization()(x)
x = Dropout(rate=dropout_rate)(x)

main_output = Dense(1, activation='sigmoid')(x)

model = K.Model(inputs=[main_input], outputs=[main_output], name='miRBind_CNN')

self.model = model

def compile_model(self, lr=0.00152):
K.backend.clear_session()
model = self.model

opt = Adam(
learning_rate=lr,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-07,
amsgrad=False,
name="Adam")

model.compile(
optimizer=opt,
loss='binary_crossentropy',
metrics=['accuracy']
)
return model
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import numpy as np
import argparse
import time
import tensorflow as tf
from tensorflow import keras as K
import matplotlib.pyplot as plt
from tensorflow.keras.utils import Sequence

from miRBind_CNN_architecture import miRBind_CNN


class DataGenerator(Sequence):
# preload the encoded numpy data
def __init__(self, data_path, labels_path, dataset_size, batch_size, validation_split=0.1,
is_validation=False, shuffle=True):
# the dataset size is needed to properly load the numpy files
self.size = dataset_size

self.data = np.memmap(data_path, dtype='float32', mode='r', shape=(self.size, 50, 20, 1))
self.labels = np.memmap(labels_path, dtype='float32', mode='r', shape=(self.size,))
self.batch_size = batch_size
self.shuffle = shuffle

# Determine number of train and validation samples
self.validation_split = validation_split
self.num_samples = len(self.data)
self.num_validation_samples = int(self.num_samples * validation_split)
self.num_train_samples = self.num_samples - self.num_validation_samples

# Determine indices for validation and training
indices = np.arange(self.num_samples)
if shuffle:
np.random.shuffle(indices)

if is_validation:
self.indices = indices[self.num_train_samples:]
else:
self.indices = indices[:self.num_train_samples]

# Shuffle the data initially
self.on_epoch_end()

def __len__(self):
# Denotes the number of batches per epoch
return int(np.ceil(len(self.indices) / float(self.batch_size)))

def __getitem__(self, idx):
# Generate one batch of data
batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
batch_data = self.data[batch_indices]
batch_labels = self.labels[batch_indices]
return batch_data, batch_labels

def on_epoch_end(self):
# Updates indices after each epoch for shuffling
if self.shuffle:
np.random.shuffle(self.indices)


def plot_history(history, ratio):
"""
Plot history of the model training,
accuracy and loss of the training and validation set
"""

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(8, 6), dpi=80)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Accuracy')
plt.legend()
plt.savefig(f"training_acc_1_{ratio}.jpg")

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Loss')
plt.legend()
plt.savefig(f"training_loss_1_{ratio}.jpg")


def train_model(data, labels, dataset_size, ratio, model_file, debug=False):
# set random state for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
K.utils.set_random_seed(42)
# TODO still not fully reproducible? why?

train_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1,
is_validation=False)
val_data_gen = DataGenerator(data, labels, dataset_size, batch_size=32, validation_split=0.1,
is_validation=True)

model = miRBind_CNN().compile_model()
model_history = model.fit(
train_data_gen,
validation_data=val_data_gen,
epochs=10,
class_weight={0: 1, 1: ratio}
)

if debug:
plot_history(model_history, ratio)

model.save(model_file)


def main():
parser = argparse.ArgumentParser(description="Train CNN model on encoded miRNA x target binding matrix dataset")
parser.add_argument('--ratio', type=int, required=True, help="Ratio of pos:neg in the training dataset")
parser.add_argument('--data', type=str, required=True, help="File with the encoded dataset")
parser.add_argument('--labels', type=str, required=True, help="File with the dataset labels")
parser.add_argument('--dataset_size', type=int, required=True,
help="Number of samples in the dataset. Needed to properly load the numpy files.")
parser.add_argument('--model', type=str, required=False, help="Filename to save the trained model")
parser.add_argument('--debug', type=bool, default=False, help="Set to True to output some plots about training")
args = parser.parse_args()

if args.model is None:
args.model = f"model_1_{args.ratio}.keras"

start = time.time()
train_model(data=args.data, labels=args.labels, dataset_size=args.dataset_size, ratio=args.ratio,
model_file=args.model, debug=args.debug)
end = time.time()

print("Elapsed time: ", end - start, " s.")


if __name__ == "__main__":
main()
Loading