Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
Initial commit with the training code and evaluation of the
machine learning model for protein-protein interaction prediction.
  • Loading branch information
albanogueira committed Apr 15, 2024
0 parents commit 7e339d9
Show file tree
Hide file tree
Showing 15 changed files with 1,006 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
logs
*.h5

.venv
.vscode/
__pycache__
21 changes: 21 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Under the MIT License:

Copyright (c) <2024> <A. Nogueira-Rodríguez, Daniel Glez-Peña, Cristina P. Vieira, Jorge Vieira, H. López-Fernández>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Running the experiments

1. Edit the experiment configuration module (e.g. `configuration_experiments.py`), which `analysis.py` loads at start-up, to set the appropriate experimental setup.
2. Run the `python3 analysis.py configuration_experiments [experiment_name]` command (the first argument is the name of the file with the selected configuration, and the optional second argument is an additional name for the experiment logs folder).

To see stdout logs as soon as they are produced, run Python with the `-u` (unbuffered) option. This is especially useful when stdout is written to a file (e.g. with `nohup`), because the large buffers used in that case may retain the output for a while.

For example, `python3 -u analysis.py configuration_experiments KNN`


The models developed in this study are:
<ul>
<li><strong>PPIIBM_first_item</strong>, Pair Prediction by Item Identification Baseline Model (first item mode)</li>
<li><strong>PPIIBM_both_items</strong>, Pair Prediction by Item Identification Baseline Model (both items mode)</li>
</ul>

The classic machine learning models that can be selected are:
<ul>
<li><strong>KNN</strong>, k-nearest neighbors</li>
<li><strong>LR</strong>, logistic regression classifier</li>
<li><strong>RF</strong>, random forest classifier</li>
<li><strong>SVC</strong>, Support Vector Classifier</li>
</ul>



# Creating the virtual environment

## Python venv

```
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

# Running with GPU
It is necessary to have Conda previously installed on your system.

Creating the Conda environment with RAPIDS:

```
conda create -n rapids-24.02 -c rapidsai -c conda-forge -c nvidia cuml=24.02 python=3.10 cuda-version=11.8
```

Once the environment is activated, you can run the Python scripts on the GPU with RAPIDS by setting the `use_GPU = True` flag in the experiment configuration file.
131 changes: 131 additions & 0 deletions analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import time
import numpy as np

from datetime import datetime
from sklearn.model_selection import train_test_split

import functions as fn
import scoring as sc
import importlib
import sys

from datasets import load_h5_as_df
from print import pr_cyan, pr_green, pr_red, pr_yellow, pr_orange

def dump_configuration(file, vars):
    """Write the current value of each named module-level variable to a file.

    One ``name=value`` line is written per entry of ``vars``, using the
    default string formatting of the value.

    Args:
        file: Path of the text file to create (overwritten if it exists).
        vars: Iterable with the names of global variables of this module.

    Raises:
        NameError: If any name is not defined at module level. The check runs
            before the file is opened, so no partial file is left behind.
    """
    # Look the names up directly instead of passing them through eval(),
    # which would execute any expression found in the list.
    namespace = globals()
    names = list(vars)

    missing = [name for name in names if name not in namespace]
    if missing:
        raise NameError(
            f"Undefined configuration variable(s): {', '.join(missing)}")

    with open(file, 'w', encoding='utf-8') as output_file:
        output_file.writelines(
            f'{name}={namespace[name]}\n' for name in names)

def import_module(module_name):
    """Import ``module_name`` and expose its attributes as globals.

    Every attribute whose name does not start with a double underscore is
    copied into this module's global namespace. A failed import is reported
    through ``pr_red`` and is not re-raised.
    """
    try:
        # Dynamic import: the module name is only known at run time
        loaded = importlib.import_module(module_name)
        pr_green(f"Successfully imported module: {module_name}")

        # Publish everything but dunder attributes in the global namespace
        exported = {
            name: getattr(loaded, name)
            for name in dir(loaded)
            if not name.startswith('__')
        }
        globals().update(exported)
    except ImportError:
        pr_red(f"Failed to import module: {module_name}")

#
# Command line handling: one mandatory argument (configuration module) and
# one optional argument (experiment name).
#
if len(sys.argv) < 2 or len(sys.argv) > 3:
    print("Usage: python analysis.py <configuration_module_name> [experiment_name]")
    # sys.exit() instead of exit(): the latter is a helper added by the
    # `site` module for interactive sessions and is not always available.
    sys.exit(1)

# Drop the extension so that both "config" and "config.py" are accepted
module_name = os.path.splitext(sys.argv[1])[0]
import_module(module_name)
required_variables = [
    'random_state', 'test_size', 'n_jobs', 'shuffle',
    'use_GPU', 'datasets', 'models', 'embeddings_combinators',
    'nested_cv_outer_splits', 'nested_cv_inner_splits', 'per_fold',
    'print_debug_messages'
]

# import_module() copies the configuration into globals(); fail early if the
# configuration module (or its import) did not provide a required variable.
for var_name in required_variables:
    if var_name not in globals():
        raise ImportError(f"Variable '{var_name}' is not imported from module '{module_name}'")

experiment_name = ''
if len(sys.argv) > 2:
    experiment_name = '_' + sys.argv[2]

# Each run writes into its own timestamped folder; os.makedirs() raises if
# the folder already exists.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logs_dir = f'logs/{timestamp}{experiment_name}'
os.makedirs(logs_dir)

config_log = f'{logs_dir}/_CONFIG.txt'
dump_configuration(config_log, required_variables)

#
# The results* dictionaries will contain dataset names in the first level of keys.
# The second level will contain model names. They can be processed with "fn.results_report".
#
results = {}
results_log = f'{logs_dir}/_RESULTS_ALL.csv'

metrics = sc.DEFAULT_SCORING_DICT

# The execution time is reported as an extra column after the scoring metrics
metric_names = list(metrics.keys())
metric_names.append('time')

fn.write_or_append_file(results_log, fn.results_csv_row_header(metric_names))

# Run one nested cross-validation per (dataset, combinator, model) triple
for dataset in datasets:
    X, y = load_h5_as_df(dataset)
    print(f'Loaded {dataset} dataset: X = {X.shape}, y = {y.shape}')

    for combinator in embeddings_combinators:
        dataset_name = f'{dataset}__{combinator}'
        duplicated_labels = combinator.should_duplicate_labels()

        if test_size > 0.0:
            # Only the training part of the split is used below
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=test_size,
                random_state=random_state,
                stratify=y,
            )

            if print_debug_messages:
                pr_orange(f'Size after initial train/test split: {X_train.shape[0]} with test_size = {test_size}')
        else:
            pr_cyan('INFO: Using all data for external cross-validation as test_size = 0.0')
            X_train, y_train = X, y

        if shuffle:
            # The labels are shuffled in place
            pr_yellow('WARNING: shuffle is enabled!!')
            dataset_name = 'SHUFFLE_' + dataset_name
            np.random.shuffle(y_train)

        dataset_results = {}
        results[dataset_name] = dataset_results

        for model in models:
            model_name = model.name
            print(f'Start new nested CV: {model_name} with {dataset_name}')
            started_at = time.time()

            pred_y_folds, true_y_folds, test_indexes = fn.do_nested_cv(
                X_train,
                y_train,
                model.clf,
                model.param_grid,
                combinator,
                outer_splits=nested_cv_outer_splits,
                inner_splits=nested_cv_inner_splits,
                n_jobs=n_jobs,
                print_debug_messages=print_debug_messages,
            )

            total_time = time.time() - started_at
            print(f'End nested CV: {model_name} with {dataset_name}. Execution time: {total_time:.3f}')

            # Metrics of this model plus the time spent in the nested CV
            model_results = fn.compute_metrics(
                pred_y_folds, true_y_folds, metrics, per_fold=per_fold)
            dataset_results[model_name] = model_results
            model_results.update({'time': total_time})

            fn.show_intermediate_results(
                dataset_name, model_name, model_results, results_log, '\tCV results:')

            fn.log_folds(
                pred_y_folds,
                true_y_folds,
                test_indexes,
                X_train,
                y_train,
                logs_dir,
                f'{dataset_name}__{model_name}',
            )


print('\n', '# All interactions #')
fn.cat(results_log)
19 changes: 19 additions & 0 deletions configuration_experiments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import embeddings as em
import models.models as md

# Experiment configuration loaded by analysis.py, which requires every
# variable below except `models_to_exec` to be defined.

random_state = 2024  # Seed for the initial train/test split and for md.prepare_models
test_size = 0.2 # Use 0.0 to skip the initial train/test split
n_jobs = -1 # None means 1; -1 means all processors;
shuffle = False # Use True to run the sanity check
use_GPU = False # Use True to use the GPU
nested_cv_outer_splits = 5  # Passed as outer_splits to the nested cross-validation
nested_cv_inner_splits = 5  # Passed as inner_splits to the nested cross-validation
per_fold = True # True means that metrics are computed for each fold separately and then averaged
print_debug_messages = False

# Dataset files; each one is loaded by analysis.py with load_h5_as_df
datasets = ['dataset_clean_wei_seqvec.h5', 'dataset_clean_wei_protbert.h5']
# Names of the models to run, resolved into model objects by md.prepare_models
models_to_exec = ['PPIIBM_first_item', 'PPIIBM_both_items']
models = md.prepare_models(models_to_exec, random_state, use_GPU)
# Embedding combinators; analysis.py runs every model once per combinator
embeddings_combinators = [
    em.ConcatEmbeddings(add_inverted_interactions=False)
]
60 changes: 60 additions & 0 deletions datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import h5py
import pickle

import numpy as np
import pandas as pd


def compute_counts(df, target):
    """Count, per protein, its appearances in positive and negative pairs.

    Args:
        df: DataFrame with 'prot1' and 'prot2' columns, one pair per row.
        target: Sequence of labels aligned by position with the rows of
            ``df``. A label equal to 1 marks a positive pair; any other
            value marks a negative one. Entries beyond the number of rows
            are ignored.

    Returns:
        Tuple ``(counts_y_true_1, counts_y_true_0)`` of dicts mapping every
        protein found in either column to the number of positive / negative
        pairs it takes part in. A protein present in both columns of a row
        is counted twice for that row.

    Raises:
        IndexError: If ``target`` has fewer entries than ``df`` has rows.
    """
    if len(target) < len(df):
        raise IndexError('target has fewer entries than df has rows')

    proteins = set(df['prot1']).union(df['prot2'])
    counts_y_true_1 = dict.fromkeys(proteins, 0)
    counts_y_true_0 = dict.fromkeys(proteins, 0)

    # zip() pairs rows and labels by position, which avoids the per-row
    # overhead of iterrows() and does not depend on the index of `target`
    # when it is a pandas Series.
    for prot1, prot2, label in zip(df['prot1'], df['prot2'], target):
        counts = counts_y_true_1 if label == 1 else counts_y_true_0
        counts[prot1] += 1
        counts[prot2] += 1

    return counts_y_true_1, counts_y_true_0


def create_ppi_dataset(prot1, emb_prot1, prot2, emb_prot2, target):
    """Build the feature DataFrame and the label array of a PPI dataset.

    Args:
        prot1: Identifiers of the first protein of each pair.
        emb_prot1: Embeddings of the first protein of each pair.
        prot2: Identifiers of the second protein of each pair.
        emb_prot2: Embeddings of the second protein of each pair.
        target: Labels of the pairs, given as booleans, numbers or the
            strings 'True' / 'False'.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame with the columns
        'emb_prot1', 'emb_prot2', 'prot1' and 'prot2' (in that order) and
        ``y`` is a NumPy array with the labels converted to integers.

    Raises:
        ValueError: If ``emb_prot1`` and ``emb_prot2`` differ in length.
    """
    # Validate before building the DataFrame: once both sequences are
    # columns of the same frame their lengths can no longer differ.
    if len(emb_prot1) != len(emb_prot2):
        raise ValueError("Arrays emb_prot1 and emb_prot2 should have the same length")

    df = pd.DataFrame({
        'prot1': prot1,
        'emb_prot1': emb_prot1,
        'prot2': prot2,
        'emb_prot2': emb_prot2,
        'target': target
    })

    X = df[['emb_prot1', 'emb_prot2', 'prot1', 'prot2']]

    labels = df['target']
    if pd.api.types.is_bool_dtype(labels) or pd.api.types.is_numeric_dtype(labels):
        y = labels.astype(int)
    else:
        # Textual labels: translate them before the integer conversion.
        # Checking for "not boolean/numeric" instead of "object" also covers
        # pandas versions that store strings in a dedicated string dtype.
        y = labels.map({'True': True, 'False': False}).astype(int)

    return X, np.array(y)


def load_h5_as_df(input_file):
    """Load a pickled PPI dataset stored in an HDF5 file.

    The file must contain a 'dataset' entry holding the pickled rows. Each
    row is read as ``(prot1, emb_prot1, prot2, emb_prot2, target)``.

    Args:
        input_file: Path of the HDF5 file to read.

    Returns:
        The ``(X, y)`` tuple produced by ``create_ppi_dataset``.
    """
    with h5py.File(input_file, 'r') as h5:
        serialized = h5['dataset'][()]

    # SECURITY: unpickling executes code embedded in the data, so only
    # dataset files coming from a trusted source must be loaded.
    # tobytes() replaces ndarray.tostring(), its deprecated alias.
    dataset = pickle.loads(serialized.tobytes())

    return create_ppi_dataset(
        [row[0] for row in dataset],
        [np.array(row[1]) for row in dataset],  # Assuming row[1] is an ndarray
        [row[2] for row in dataset],
        [np.array(row[3]) for row in dataset],  # Assuming row[3] is an ndarray
        [row[4] for row in dataset]  # Assuming row[4] is a boolean
    )
Binary file added datasets.zip
Binary file not shown.
99 changes: 99 additions & 0 deletions embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from abc import ABC, abstractmethod
import numpy as np

class AbstractMergeEmbeddings(ABC):
    """Abstract interface of the classes that unpack embeddings from a DataFrame."""

    @abstractmethod
    def should_duplicate_labels(self):
        """Tell whether the labels should be duplicated."""

    @abstractmethod
    def unpack_embeddings(self, df_embeddings):
        """Unpack and process the embeddings held in the given DataFrame."""

    @abstractmethod
    def unpack_embeddings_test(self, df_embeddings):
        """Unpack and process the embeddings for testing purposes."""

    @abstractmethod
    def __str__(self):
        """Return a meaningful string representation of the instance."""

    def __repr__(self):
        # Same text as the informal representation
        return str(self)

def create_consecutive_groups(n_samples, n_groups):
    """Assign consecutive samples to consecutive groups.

    Every group receives ``n_samples // n_groups`` samples and the samples
    left over by the integer division are added to the last group.

    Args:
        n_samples: Number of samples to label (non-negative).
        n_groups: Number of groups (positive, unless there are no samples).

    Returns:
        List of length ``n_samples`` with the group index of each sample.
        An empty list is returned when there are no samples.

    Raises:
        ValueError: If ``n_samples`` is negative, or if ``n_groups`` is not
            positive while there are samples to assign.
    """
    if n_samples < 0:
        raise ValueError(f'n_samples must be non-negative, got {n_samples}')
    if n_samples == 0:
        # Nothing to assign; this also covers the 0 samples / 0 groups case,
        # which would otherwise divide by zero.
        return []
    if n_groups <= 0:
        raise ValueError(f'n_groups must be positive, got {n_groups}')

    group_size, remainder = divmod(n_samples, n_groups)
    groups = [group for group in range(n_groups) for _ in range(group_size)]
    # Leftover samples go to the last group
    groups.extend([n_groups - 1] * remainder)
    return groups

class AbstractNumpyMergeEmbeddings(AbstractMergeEmbeddings):
    """Base class that merges each pair of embeddings through ``merge_embeddings``."""

    @abstractmethod
    def merge_embeddings(self, embeddings_1, embeddings_2):
        """Merge the two given embeddings."""

    def should_duplicate_labels(self):
        """Labels are not duplicated by default."""
        return False

    def unpack_embeddings(self, df_embeddings):
        """Merge the 'emb_prot1' and 'emb_prot2' embeddings of every row.

        Returns a tuple with the merged rows as a NumPy array and the list
        of groups, where every row is given a group of its own.
        """
        merged = []
        pairs = zip(df_embeddings['emb_prot1'], df_embeddings['emb_prot2'])
        for first, second in pairs:
            merged.append(self.merge_embeddings(first.flatten(), second.flatten()))

        total = len(merged)
        # As many groups as samples
        groups = create_consecutive_groups(total, total)

        return np.array(merged), groups

    def unpack_embeddings_test(self, df_embeddings):
        """Unpack the embeddings exactly as ``unpack_embeddings`` does."""
        return self.unpack_embeddings(df_embeddings)


class AddEmbeddings(AbstractNumpyMergeEmbeddings):
    """Combinator that adds the two embeddings."""

    def merge_embeddings(self, embeddings_1, embeddings_2):
        """Return ``np.add`` of both embeddings."""
        return np.add(embeddings_1, embeddings_2)

    def __str__(self):
        return 'add'


class MultiplyEmbeddings(AbstractNumpyMergeEmbeddings):
    """Combinator that multiplies the two embeddings."""

    def merge_embeddings(self, embeddings_1, embeddings_2):
        """Return ``np.multiply`` of both embeddings."""
        return np.multiply(embeddings_1, embeddings_2)

    def __str__(self):
        return 'multiply'

class ConcatEmbeddings(AbstractNumpyMergeEmbeddings):
    """Combinator that concatenates the two embeddings of a pair.

    When ``add_inverted_interactions`` is enabled, ``unpack_embeddings``
    also returns every pair concatenated in the opposite order.
    """

    def __init__(self, add_inverted_interactions=False):
        self.add_inverted_interactions = add_inverted_interactions

    def __str__(self):
        return f'concat_invert_{self.add_inverted_interactions}'

    def should_duplicate_labels(self):
        """Labels are duplicated only when the inverted pairs are added."""
        return self.add_inverted_interactions

    def merge_embeddings(self, embeddings_1, embeddings_2):
        """Return the first embedding followed by the second one."""
        return np.concatenate((embeddings_1, embeddings_2))

    def unpack_embeddings(self, df_embeddings):
        """Merge every pair and, if enabled, append the inverted pairs."""
        rows, groups = super().unpack_embeddings(df_embeddings)

        if not self.add_inverted_interactions:
            return rows, groups

        # Second embedding first: the same pairs in the opposite order
        inverted = np.array([
            np.concatenate((second.flatten(), first.flatten()))
            for first, second in zip(
                df_embeddings['emb_prot1'], df_embeddings['emb_prot2'])
        ])

        # Repeating the group list gives each inverted row the group of
        # the row it was derived from
        return np.concatenate((rows, inverted)), groups * 2

    def unpack_embeddings_test(self, df_embeddings):
        """Merge every pair without adding the inverted ones."""
        return super().unpack_embeddings(df_embeddings)
Loading

0 comments on commit 7e339d9

Please sign in to comment.