Initial commit

Initial commit with the training code and evaluation of the machine learning model for protein-protein interaction prediction.
sing-group · Apr 15, 2024 · 7e339d9 · 7e339d9
commit 7e339d9
Show file tree

Hide file tree

Showing 15 changed files with 1,006 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+logs
+*.h5
+
+.venv
+.vscode/
+__pycache__
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+Under the MIT License:
+
+Copyright (c) <2024> <A. Nogueira-Rodríguez, Daniel Glez-Peña, Cristina P. Vieira, Jorge Vieira, H. López-Fernández>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,46 @@
+# Running the experiments
+
+1. Edit the experiment configuration file (e.g. `configuration_experiments.py`) to set the appropriate experimental setup.
+2. Run the script with `python3 analysis.py configuration_experiments [experiment_name]` (the first argument is the name of the file with the selected configuration, and the optional second argument is an additional name appended to the experiment logs folder).
+
+To see the stdout logs as soon as they are produced, run Python with the `-u` (unbuffered) option. This is especially useful when stdout is written to a file (e.g. with `nohup`), because the large buffers used in that case may hold the output back for a while.
+
+For example, `python3 -u analysis.py configuration_experiments KNN`
+
+
+The models developed in this study are:
+<ul>
+  <li><strong>PPIIBM_first_item</strong>, Pair Prediction by Item Identification Baseline Model (first item mode)</li>
+  <li><strong>PPIIBM_both_items</strong>, Pair Prediction by Item Identification Baseline Model (both items mode)</li>
+</ul> 
+
+The classic machine learning models that can be selected are:
+<ul>
+  <li><strong>KNN</strong>, k-nearest neighbors</li>
+  <li><strong>LR</strong>, logistic regression classifier</li>
+  <li><strong>RF</strong>, random forest classifier</li>
+  <li><strong>SVC</strong>, Support Vector Classifier</li>
+</ul>
+
+
+
+# Creating the virtual environment
+
+## Python venv
+
+```
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+# Running with GPU
+It is necessary to have Conda previously installed on your system.
+
+Creating the Conda environment with RAPIDS:
+
+```
+conda create -n rapids-24.02 -c rapidsai -c conda-forge -c nvidia cuml=24.02 python=3.10 cuda-version=11.8
+```
+
+Once the environment is activated, you can run the Python scripts on the GPU with RAPIDS by setting `use_GPU = True` in the experiment configuration file.
diff --git a/analysis.py b/analysis.py
@@ -0,0 +1,131 @@
+import os
+import time 
+import numpy as np
+
+from datetime import datetime
+from sklearn.model_selection import train_test_split
+
+import functions as fn
+import scoring as sc
+import importlib
+import sys
+
+from datasets import load_h5_as_df
+from print import pr_cyan, pr_green, pr_red, pr_yellow, pr_orange
+
+def dump_configuration(file, vars, namespace=None):
+    """Write one ``name=value`` line per variable name to a text file.
+
+    Args:
+        file: Path of the file to create; existing content is overwritten.
+        vars: Iterable of variable names to dump, in output order.
+        namespace: Mapping in which the names are looked up. Defaults to this
+            module's globals.
+
+    Raises:
+        KeyError: If a requested name is missing from the namespace.
+    """
+    if namespace is None:
+        namespace = globals()
+
+    # A plain lookup replaces eval(): the entries are names, not code, and
+    # eval() would also resolve them against this function's locals first
+    # (so a variable called "file" or "vars" would dump the wrong value).
+    with open(file, 'w', encoding='utf-8') as output_file:
+        output_file.writelines(f'{var}={namespace[var]}\n' for var in vars)
+
+def import_module(module_name):
+    """Import a module by name and expose its attributes as globals here.
+
+    Every attribute whose name does not start with a double underscore is
+    copied into this module's global namespace. A failed import is only
+    reported on screen; no exception reaches the caller.
+
+    Args:
+        module_name: Importable name of the module to load.
+    """
+    try:
+        loaded = importlib.import_module(module_name)
+        pr_green(f"Successfully imported module: {module_name}")
+
+        # Skip dunder entries (__name__, __file__, ...) so this module's own
+        # metadata is not overwritten.
+        exported_names = (name for name in dir(loaded) if not name.startswith('__'))
+        for name in exported_names:
+            globals()[name] = getattr(loaded, name)
+    except ImportError:
+        pr_red(f"Failed to import module: {module_name}")
+
+# Command line: analysis.py <configuration_module_name> [experiment_name]
+if len(sys.argv) < 2 or len(sys.argv) > 3:
+    print("Usage: python analysis.py <configuration_module_name> [experiment_name]")
+    # sys.exit() instead of exit(): the bare helper is only installed by the
+    # site module and is intended for interactive sessions.
+    sys.exit(1)
+
+# Drop a trailing extension so the configuration can be given with or without ".py".
+module_name = os.path.splitext(sys.argv[1])[0]
+import_module(module_name)
+required_variables = [
+    'random_state', 'test_size', 'n_jobs', 'shuffle',
+    'use_GPU', 'datasets', 'models', 'embeddings_combinators',
+    'nested_cv_outer_splits', 'nested_cv_inner_splits', 'per_fold',
+    'print_debug_messages'
+]
+
+# import_module() only reports a failed import, so an incomplete or missing
+# configuration is detected (and the run aborted) here.
+for var_name in required_variables:
+    if var_name not in globals():
+        raise ImportError(f"Variable '{var_name}' is not imported from module '{module_name}'")
+
+experiment_name=''
+if len(sys.argv) > 2:
+    experiment_name = '_' + sys.argv[2]
+
+# Each run writes into its own timestamped folder under logs/.
+timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+logs_dir = f'logs/{timestamp}{experiment_name}'
+os.makedirs(logs_dir)
+
+# Keep a copy of the configuration used next to the results.
+config_log = f'{logs_dir}/_CONFIG.txt'
+dump_configuration(config_log, required_variables)
+
+#
+# The results* dictionaries will contain dataset names in the first level of keys.
+# The second level will contain model names. They can be processed with "fn.results_report".
+#
+results = {}
+results_log = f'{logs_dir}/_RESULTS_ALL.csv'
+
+metrics = sc.DEFAULT_SCORING_DICT
+
+# The execution time is reported as one more column after the scoring metrics.
+metric_names = list(metrics.keys())
+metric_names.extend(['time'])
+
+fn.write_or_append_file(results_log, fn.results_csv_row_header(metric_names))
+
+for dataset in datasets:
+    X, y = load_h5_as_df(dataset)
+    print(f'Loaded {dataset} dataset: X = {X.shape}, y = {y.shape}')
+
+    for combinator in embeddings_combinators:
+        dataset_name = f'{dataset}__{combinator}'
+
+        if test_size > 0.0:
+            # NOTE: the held-out part (X_test, y_test) is not used further in this script.
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=test_size, random_state=random_state, stratify=y)
+
+            if print_debug_messages:
+                pr_orange(f'Size after initial train/test split: {X_train.shape[0]} with test_size = {test_size}')
+
+        else:
+            pr_cyan('INFO: Using all data for external cross-validation as test_size = 0.0')
+            X_train = X
+            y_train = y
+
+        if shuffle:
+            pr_yellow('WARNING: shuffle is enabled!!')
+            dataset_name = 'SHUFFLE_' + dataset_name
+            # Permute a copy with a generator seeded from random_state: the
+            # sanity check becomes reproducible, and the shared `y` array is
+            # no longer modified in place when test_size == 0.0 (where
+            # y_train is the very same object as y).
+            y_train = np.random.RandomState(random_state).permutation(y_train)
+
+        results[dataset_name] = {}
+
+        for model in models:
+            model_name = model.name
+            print(f'Start new nested CV: {model_name} with {dataset_name}')
+            start_time = time.time()
+
+            pred_y_folds, true_y_folds, test_indexes = fn.do_nested_cv(
+                    X_train, y_train, model.clf, model.param_grid, 
+                    combinator,
+                    outer_splits=nested_cv_outer_splits,
+                    inner_splits=nested_cv_inner_splits,
+                    n_jobs=n_jobs,
+                    print_debug_messages=print_debug_messages
+                )
+
+            total_time = time.time() - start_time
+            print(f'End nested CV: {model_name} with {dataset_name}. Execution time: {total_time:.3f}')
+
+            results[dataset_name][model_name] = fn.compute_metrics(pred_y_folds, true_y_folds, metrics, per_fold=per_fold)
+
+            results[dataset_name][model_name].update({'time': total_time})
+
+            fn.show_intermediate_results(
+                dataset_name, model_name, results[dataset_name][model_name], results_log, '\tCV results:')
+
+            fn.log_folds(pred_y_folds, true_y_folds, test_indexes, X_train, y_train, logs_dir, f'{dataset_name}__{model_name}')
+
+
+print('\n', '# All interactions #')
+fn.cat(results_log)
diff --git a/configuration_experiments.py b/configuration_experiments.py
@@ -0,0 +1,19 @@
+import embeddings as em
+import models.models as md
+
+# --- Execution ---------------------------------------------------------------
+random_state = 2024
+n_jobs = -1  # None means 1; -1 means all processors
+use_GPU = False  # Set to True to use the GPU
+print_debug_messages = False
+
+# --- Data splitting and evaluation -------------------------------------------
+test_size = 0.2  # Use 0.0 to skip the initial train/test split
+shuffle = False  # Set to True to run the sanity check
+nested_cv_outer_splits = 5
+nested_cv_inner_splits = 5
+per_fold = True  # True means that metrics are computed for each fold separately and then averaged
+
+# --- Datasets, models and embedding combinators ------------------------------
+datasets = [
+    'dataset_clean_wei_seqvec.h5',
+    'dataset_clean_wei_protbert.h5',
+]
+models_to_exec = ['PPIIBM_first_item', 'PPIIBM_both_items']
+models = md.prepare_models(models_to_exec, random_state, use_GPU)
+embeddings_combinators = [em.ConcatEmbeddings(add_inverted_interactions=False)]
diff --git a/datasets.py b/datasets.py
@@ -0,0 +1,60 @@
+import h5py
+import pickle
+
+import numpy as np
+import pandas as pd
+
+
+def compute_counts(df, target):
+    """Count, per protein, the positive and negative pairs it takes part in.
+
+    Args:
+        df: DataFrame with ``prot1`` and ``prot2`` columns, one pair per row.
+        target: Labels looked up by row position; ``1`` marks a positive pair
+            and any other value is counted as negative.
+
+    Returns:
+        A ``(positive_counts, negative_counts)`` tuple of dicts keyed by
+        protein. Every protein found in either column is present in both.
+    """
+    all_proteins = set(df['prot1']) | set(df['prot2'])
+    positive_counts = dict.fromkeys(all_proteins, 0)
+    negative_counts = dict.fromkeys(all_proteins, 0)
+
+    for position, (first, second) in enumerate(zip(df['prot1'], df['prot2'])):
+        # Both members of the pair are credited to the same bucket.
+        bucket = positive_counts if target[position] == 1 else negative_counts
+        bucket[first] += 1
+        bucket[second] += 1
+
+    return positive_counts, negative_counts
+
+
+def create_ppi_dataset(prot1, emb_prot1, prot2, emb_prot2, target):
+    """Build the feature frame and the label vector of a protein-pair dataset.
+
+    Args:
+        prot1: Identifiers of the first protein of each pair.
+        emb_prot1: Embeddings of the first protein of each pair.
+        prot2: Identifiers of the second protein of each pair.
+        emb_prot2: Embeddings of the second protein of each pair.
+        target: Interaction labels: booleans, numbers, or the strings
+            ``'True'`` / ``'False'``.
+
+    Returns:
+        A tuple ``(X, y)``. ``X`` is a DataFrame with the columns
+        ``emb_prot1``, ``emb_prot2``, ``prot1`` and ``prot2``; ``y`` is a
+        NumPy array with the labels converted to integers.
+
+    Raises:
+        ValueError: If the two embedding sequences differ in length, or if a
+            non-numeric label is neither a boolean nor ``'True'``/``'False'``.
+    """
+    # Validate before building the frame: the DataFrame constructor rejects
+    # columns of different lengths itself, so a check placed after it could
+    # never produce this message.
+    if len(emb_prot1) != len(emb_prot2):
+        raise ValueError("Arrays emb_prot1 and emb_prot2 should have the same length")
+
+    df = pd.DataFrame({
+        'prot1': prot1,
+        'emb_prot1': emb_prot1,
+        'prot2': prot2,
+        'emb_prot2': emb_prot2,
+        'target': target
+    })
+
+    X = df[['emb_prot1', 'emb_prot2', 'prot1', 'prot2']]
+
+    labels = df['target']
+    if pd.api.types.is_numeric_dtype(labels):
+        # Boolean and numeric columns convert directly.
+        y = labels.astype(int)
+    else:
+        # Textual (or mixed) labels: accept the string spellings as well as
+        # real booleans, and fail loudly on anything else instead of letting
+        # NaN reach the integer cast.
+        mapped = labels.map({'True': True, 'False': False, True: True, False: False})
+        if mapped.isna().any():
+            raise ValueError("Column target contains values that are not boolean labels")
+        y = mapped.astype(int)
+
+    return X, np.array(y)
+
+
+def load_h5_as_df(input_file):
+    """Load a pickled protein-pair dataset stored inside an HDF5 file.
+
+    The file must contain a ``dataset`` entry holding the pickled rows. Each
+    row is read positionally as
+    ``(prot1, emb_prot1, prot2, emb_prot2, target)``.
+
+    Args:
+        input_file: Path of the HDF5 file to read.
+
+    Returns:
+        The ``(X, y)`` tuple produced by ``create_ppi_dataset``.
+    """
+    with h5py.File(input_file, 'r') as h5:
+        # tobytes() replaces tostring(), a deprecated alias that was removed
+        # in NumPy 2.0. The bytes are copied, so the file can be closed
+        # before the rows are decoded.
+        serialized = h5['dataset'][()].tobytes()
+
+    # SECURITY: pickle can execute arbitrary code while loading; only open
+    # dataset files that come from a trusted source.
+    dataset = pickle.loads(serialized)
+
+    return create_ppi_dataset(
+        [row[0] for row in dataset],
+        [np.array(row[1]) for row in dataset], # Assuming row[1] is an ndarray
+        [row[2] for row in dataset],
+        [np.array(row[3]) for row in dataset], # Assuming row[3] is an ndarray
+        [row[4] for row in dataset] # Assuming row[4] is a boolean
+    )
diff --git a/datasets.zip b/datasets.zip
diff --git a/embeddings.py b/embeddings.py
@@ -0,0 +1,99 @@
+from abc import ABC, abstractmethod
+import numpy as np
+
+class AbstractMergeEmbeddings(ABC):
+    """Interface of the strategies that combine the two embeddings of a pair."""
+
+    @abstractmethod
+    def should_duplicate_labels(self):
+        """Tell whether the labels should be duplicated."""
+
+    @abstractmethod
+    def unpack_embeddings(self, df_embeddings):
+        """Unpack and process the embeddings held in the given DataFrame."""
+
+    @abstractmethod
+    def unpack_embeddings_test(self, df_embeddings):
+        """Unpack and process the embeddings for testing purposes."""
+
+    @abstractmethod
+    def __str__(self):
+        """Return a meaningful string representation of the instance."""
+
+    def __repr__(self):
+        # repr() deliberately mirrors str().
+        return self.__str__()
+
+def create_consecutive_groups(n_samples, n_groups):
+    """Assign consecutive samples to groups of equal size.
+
+    Each group receives ``n_samples // n_groups`` consecutive samples; any
+    samples left over are assigned to the last group.
+
+    Args:
+        n_samples: Number of samples to label.
+        n_groups: Number of groups to spread the samples over.
+
+    Returns:
+        A list with one group id per sample, in sample order.
+    """
+    # Nothing to label: return early so that the (0, 0) call made for an
+    # empty dataset does not divide by zero.
+    if n_samples == 0:
+        return []
+
+    group_size, remainder = divmod(n_samples, n_groups)
+    groups = [group for group in range(n_groups) for _ in range(group_size)]
+    # Leftover samples all join the last group.
+    groups.extend([n_groups - 1] * remainder)
+    return groups
+
+class AbstractNumpyMergeEmbeddings(AbstractMergeEmbeddings):
+    """Base class of the combinators that merge each pair of embeddings with NumPy."""
+
+    @abstractmethod
+    def merge_embeddings(self, embeddings_1, embeddings_2):
+        """Combine the two flattened embeddings of one pair into one vector."""
+
+    def unpack_embeddings(self, df_embeddings):
+        """Merge the embeddings of every row of the frame.
+
+        Args:
+            df_embeddings: DataFrame with ``emb_prot1`` and ``emb_prot2``
+                columns whose values provide a ``flatten()`` method.
+
+        Returns:
+            A tuple ``(rows, groups)``: a NumPy array with one merged vector
+            per input row, and a list with one distinct group id per row.
+        """
+        rows = [
+            self.merge_embeddings(first.flatten(), second.flatten())
+            for first, second in zip(df_embeddings['emb_prot1'], df_embeddings['emb_prot2'])
+        ]
+        # Every sample is its own group. Building the ids directly gives the
+        # same list as create_consecutive_groups(n, n) and also works for an
+        # empty frame, where that call would divide by zero.
+        groups = list(range(len(rows)))
+
+        return np.array(rows), groups
+
+    def unpack_embeddings_test(self, df_embeddings):
+        """Process test data exactly like training data."""
+        return self.unpack_embeddings(df_embeddings)
+
+    def should_duplicate_labels(self):
+        """Return ``False``: one output row is produced per input row."""
+        return False
+
+
+class AddEmbeddings(AbstractNumpyMergeEmbeddings):
+    """Combinator that merges a pair of embeddings by element-wise addition."""
+
+    def merge_embeddings(self, embeddings_1, embeddings_2):
+        """Return the element-wise sum of the two embeddings."""
+        return np.add(embeddings_1, embeddings_2)
+
+    def __str__(self):
+        return 'add'
+
+
+class MultiplyEmbeddings(AbstractNumpyMergeEmbeddings):
+    """Combinator that merges a pair of embeddings by element-wise product."""
+
+    def merge_embeddings(self, embeddings_1, embeddings_2):
+        """Return the element-wise product of the two embeddings."""
+        return np.multiply(embeddings_1, embeddings_2)
+
+    def __str__(self):
+        return 'multiply'
+
+class ConcatEmbeddings(AbstractNumpyMergeEmbeddings):
+    """Combinator that concatenates the two embeddings of each pair.
+
+    With ``add_inverted_interactions`` enabled, training data additionally
+    receives every pair in swapped order (second embedding first).
+    """
+
+    def __init__(self, add_inverted_interactions=False):
+        self.add_inverted_interactions = add_inverted_interactions
+
+    def __str__(self):
+        return f'concat_invert_{self.add_inverted_interactions}'
+
+    def should_duplicate_labels(self):
+        """Labels are duplicated exactly when swapped pairs are appended."""
+        return self.add_inverted_interactions
+
+    def merge_embeddings(self, embeddings_1, embeddings_2):
+        """Return the first embedding followed by the second one."""
+        return np.concatenate((embeddings_1, embeddings_2))
+
+    def unpack_embeddings(self, df_embeddings):
+        """Merge every row and, if enabled, append the swapped pairs.
+
+        Returns:
+            A tuple ``(rows, groups)``. When swapped pairs are appended, both
+            hold the original entries followed by those of the swapped pairs,
+            and a swapped pair keeps the group id of its original.
+        """
+        rows, groups = super().unpack_embeddings(df_embeddings)
+        if not self.add_inverted_interactions:
+            return rows, groups
+
+        swapped_rows = np.array([
+            np.concatenate((second.flatten(), first.flatten()))
+            for first, second in zip(df_embeddings['emb_prot1'], df_embeddings['emb_prot2'])
+        ])
+        # groups is a list, so "* 2" repeats the ids in the same order.
+        return np.concatenate((rows, swapped_rows)), groups * 2
+
+    def unpack_embeddings_test(self, df_embeddings):
+        """Merge every row without ever appending swapped pairs."""
+        return super().unpack_embeddings(df_embeddings)