Merge pull request #543 from Proteobench/msangel-compatibility

Msangel compatibility
Proteobench · Jan 27, 2025 · 24e7ff1 · 24e7ff1
2 parents b52149e + 8f8f2c7
commit 24e7ff1
Show file tree

Hide file tree

Showing 40 changed files with 1,577 additions and 367 deletions.
diff --git a/proteobench/github/gh.py b/proteobench/github/gh.py
@@ -78,31 +78,82 @@ def clone(remote_url: str, clone_dir: str) -> Repo:
         try:
             repo = Repo(clone_dir)
         except (exc.NoSuchPathError, exc.InvalidGitRepositoryError):
-            repo = Repo.clone_from(remote_url.rstrip("/"), clone_dir)
+            repo = Repo.clone_from(remote_url.rstrip("/"), clone_dir, depth=1, no_single_branch=True)
+        return repo
+
+    @staticmethod
+    def shallow_clone(remote_url: str, clone_dir: str) -> Repo:
+        """
+        Performs a shallow clone of the repository (only the latest commit).
+
+        Args:
+            remote_url (str): The repository URL.
+            clone_dir (str): The target directory for cloning.
+
+        Returns:
+            Repo: The valid repository already present in `clone_dir`, otherwise the fresh clone.
+        """
+        if os.path.exists(clone_dir):
+            print(f"Repository already exists in {clone_dir}. Trying to use existing files.")
+            try:
+                return Repo(clone_dir)
+            except exc.InvalidGitRepositoryError:
+                print("Repository invalid, will clone again.")
+
+        # depth=1 truncates history; no_single_branch=True is passed so the clone is not limited to one branch
+        try:
+            repo = Repo.clone_from(remote_url.rstrip("/"), clone_dir, depth=1, no_single_branch=True)
+        except exc.GitCommandError as e:
+            raise RuntimeError(f"Failed to clone the repository: {e}") from e
         return repo
 
     def clone_repo_anonymous(self) -> Repo:
         """
-        Clones the Proteobench repository anonymously (without authentication).
+        Shallow-clones the Proteobench repository anonymously (without authentication).
 
         Returns:
-            Repo: The local repository object.
+            Repo: The repository object, which is also stored on `self.repo`.
         """
         remote_url = self.get_remote_url_anon()
-        repo = self.clone(remote_url, self.clone_dir)
-        return repo
+        self.repo = self.shallow_clone(remote_url, self.clone_dir)
+        return self.repo
 
-    def read_results_json_repo(self) -> pd.DataFrame:
+    def read_results_json_repo_single_file(self) -> pd.DataFrame:
         """
         Reads the `results.json` file from the cloned Proteobench repository and returns the data as a DataFrame.
 
         Returns:
             pd.DataFrame: A Pandas DataFrame containing the results from `results.json`.
         """
         f_name = os.path.join(self.clone_dir, "results.json")
+
+        if not os.path.exists(f_name):  # fail with an explicit message before handing the path to pandas
+            raise FileNotFoundError(f"File '{f_name}' does not exist.")
+
         all_datapoints = pd.read_json(f_name)
         return all_datapoints
 
+    def read_results_json_repo(self) -> pd.DataFrame:
+        """
+        Reads all JSON result files from the cloned Proteobench repository.
+
+        Returns:
+            pd.DataFrame: One row per JSON file, or the content of `results.json` if no other JSON file exists.
+        """
+        data = []
+        if not os.path.exists(self.clone_dir):
+            raise FileNotFoundError(f"Clone directory '{self.clone_dir}' does not exist.")
+
+        for file in sorted(os.listdir(self.clone_dir)):  # sorted: os.listdir order is arbitrary
+            if file.endswith(".json") and file != "results.json":
+                file_path = os.path.join(self.clone_dir, file)
+                with open(file_path, "r", encoding="utf-8") as f:
+                    data.append(pd.read_json(f, typ="series"))
+        if not data:
+            return self.read_results_json_repo_single_file()  # fall back to the single aggregated file
+
+        return pd.DataFrame(data)
+
     def clone_repo(self) -> Repo:
         """
         Clones the Proteobench repository using either an anonymous or authenticated GitHub access token.

diff --git a/proteobench/io/params/MSAngel.py b/proteobench/io/params/MSAngel.py
diff --git a/proteobench/io/params/__init__.py b/proteobench/io/params/__init__.py
@@ -1,95 +1,60 @@
-from dataclasses import dataclass
+# Reference for parameter names
+# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
+import json
+import os
+from dataclasses import dataclass, field
 from typing import Optional
 
+import numpy as np
+
 
-# Reference for parameter names
-# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
 @dataclass
 class ProteoBenchParameters:
-    """
-    Parameters for a proteomics search engine.
+    def __init__(
+        self, filename=os.path.join(os.path.dirname(__file__), "json/Quant/lfq/ion/DDA/fields.json"), **kwargs
+    ):
+        """
+        Reads the JSON file, initializes only the attributes present in it, then applies matching kwargs.
+        """
+        if not os.path.isfile(filename):
+            print(f"Error: File '{filename}' not found.")
+            return  # No initialization happens if the file is missing
+
+        with open(filename, "r", encoding="utf-8") as file:
+            json_dict = json.load(file)
+
+        # Default each field to its "value", else its "placeholder", else None
+        for key, value in json_dict.items():
+            if "value" in value:
+                setattr(self, key, value["value"])
+            elif "placeholder" in value:
+                setattr(self, key, value["placeholder"])
+            else:
+                setattr(self, key, None)
+
+        for key, value in kwargs.items():
+            if not hasattr(self, key):
+                continue  # only fields declared in the JSON file can be overridden
+            # The literal string "None" marks a missing value and is stored as NaN
+            is_missing = isinstance(value, str) and value == "None"
+            setattr(self, key, np.nan if is_missing else value)
+
+    def __repr__(self):
+        """
+        Return the string form of a dict holding every attribute whose value is not None.
+        """
+        return str({name: val for name, val in vars(self).items() if val is not None})
+
+    def fill_none(self):
+        """
+        Replace every attribute holding the literal string "None" with np.nan (real None is left as is).
+        """
+        for key, value in list(self.__dict__.items()):  # snapshot: attributes are reassigned in the loop
+            if isinstance(value, str) and value == "None":
+                setattr(self, key, np.nan)
 
-    Attributes
-    ----------
-    software_name : Optional[str]
-        Name of the software tool / pipeline used for this benchmark run
-        (examples: "MaxQuant", "AlphaPept", "Proline", ...).
-    software_version : Optional[str]
-        Version of the software tool / pipeline used for this benchmark run
-    search_engine: Optional[str]
-        Search engine used for this benchmark run
-        (examples: "Andromeda", "Mascot", ...).
-    search_engine_version : Optional[str]
-        Version of the search engine used for this benchmark run.
-    ident_fdr_psm : Optional[str]
-        False discovery rate (FDR) threshold for peptide-spectrum match
-        (PSM) validation ("0.01" = 1%).
-    ident_fdr_peptide : Optional[str]
-        False discovery rate (FDR) threshold for peptide validation ("0.01" = 1%).
-    ident_fdr_protein : Optional[str]
-        False discovery rate (FDR) threshold for protein validation ("0.01" = 1%).
-    enable_match_between_runs : Optional[bool]
-        Match between run (also named cross assignment) is enabled.
-    precursor_mass_tolerance : Optional[str]
-       Precursor mass tolerance used for the search.
-       Given as an interval of upper and lower tolerance, e.g. [-20 ppm, 20 ppm].
-    fragment_mass_tolerance : Optional[str]
-        Precursor mass tolerance used for the search:
-        Given as an interval of upper and lower tolerance, e.g. [-0.02 Da, 0.02 Da].
-    enzyme : Optional[str]
-        Enzyme used as parameter for the search. If several, use "|".
-    allowed_miscleavages : Optional[int]
-        Maximal number of missed cleavages allowed.
-    min_peptide_length : Optional[str]
-        Minimum peptide length (number of residues) allowed for the search.
-    max_peptide_length : Optional[str]
-        Maximum peptide length (number of residues) allowed for the search.
-    fixed_mods : Optional[str]
-        Fixed modifications searched for in the search. If several, separate with "|".
-    variable_mods : Optional[str]
-        Variable modifications searched for in the search. If several, separate with "|".
-    max_mods : Optional[int]
-        Maximal number of modifications per peptide
-        (including fixed and variable modifications).
-    min_precursor_charge : Optional[int]
-        Minimum precursor charge allowed.
-    max_precursor_charge : Optional[int]
-        Maximum precursor charge allowed.
-    spectral_library_generation : Optional[dict]
-        Models used to generate spectral library (DIA-specific).
-    scan_window : Optional[int]
-        Scan window radius. Ideally corresponds to approximate
-        average number of data points per peak (DIA-specific).
-    quantification_method_DIANN : Optional[str]
-        Quantification strategy used in the DIA-NN engine (DIANN-specific).
-    second_pass : Optional[bool]
-        Whether second pass search is enabled (DIANN-specific).
-    protein_inference : Optional[str]
-        Protein inference method used.
-    """
 
-    software_name: Optional[str] = None
-    software_version: Optional[str] = None
-    search_engine: Optional[str] = None
-    search_engine_version: Optional[str] = None
-    ident_fdr_psm: Optional[str] = None  # fdr_psm
-    ident_fdr_peptide: Optional[float] = None  # fdr_peptide
-    ident_fdr_protein: Optional[float] = None  # fdr_protein
-    enable_match_between_runs: Optional[bool] = None  # MBR
-    precursor_mass_tolerance: Optional[str] = None  # precursor_tol, precursor_tol_unit
-    fragment_mass_tolerance: Optional[str] = None  # fragment_tol, fragment_tol_unit
-    enzyme: Optional[str] = None  # enzyme_name
-    allowed_miscleavages: Optional[int] = None  # missed_cleavages
-    min_peptide_length: Optional[int] = None  # min_pep_length
-    max_peptide_length: Optional[int] = None  # max_pep_length
-    fixed_mods: Optional[str] = None  # fixed_modifications
-    variable_mods: Optional[str] = None  # variable_modifications
-    max_mods: Optional[int] = None  # max_num_modifications
-    min_precursor_charge: Optional[int] = None  # precursor_charge
-    max_precursor_charge: Optional[int] = None
-    scan_window: Optional[int] = None  # DIA-specific
-    quantification_method: Optional[str] = None  #
-    second_pass: Optional[bool] = None  # DIANN specific
-    protein_inference: Optional[str] = None  # example occams razor, proteinprophet
-    predictors_library: Optional[dict] = None  # type of model used to generate spectral library
-    abundance_normalization_ions: Optional[str] = None  # tic, median etc.
+# When run as a script: build the parameters from the default fields.json and print the non-None attributes
+if __name__ == "__main__":
+    proteo_params = ProteoBenchParameters()
+    print(proteo_params)
diff --git a/proteobench/io/params/alphapept.py b/proteobench/io/params/alphapept.py
@@ -67,6 +67,7 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     params.max_precursor_charge = record["features"]["iso_charge_max"]
     params.enable_match_between_runs = record["workflow"]["match"]  # Check if matching is enabled
     params.abundance_normalization_ions = None  # No normalization in AlphaPept
+    params.fill_none()
     return params
 
 

diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py
@@ -192,6 +192,8 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
     if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
         params.protein_inference = f"ProteinProphet: {fragpipe_params.loc['protein-prophet.cmd-opts']}"
 
+    params.fill_none()
+
     return params
 
 

diff --git a/proteobench/io/params/i2masschroq.py b/proteobench/io/params/i2masschroq.py
@@ -67,6 +67,7 @@ def _extract_xtandem_params(params: pd.Series) -> ProteoBenchParameters:
         min_precursor_charge=1,
         max_precursor_charge=int(params.loc["spectrum, maximum parent charge"]),
     )
+    params.fill_none()
     return params
 
 
@@ -120,6 +121,7 @@ def _extract_sage_params(params: pd.Series) -> ProteoBenchParameters:
         min_precursor_charge=int(min_precursor_charge),
         max_precursor_charge=int(max_precursor_charge),
     )
+    params.fill_none()
     return params