Skip to content

Commit

Permalink
Merge pull request #543 from Proteobench/msangel-compatibility
Browse files Browse the repository at this point in the history
Msangel compatibility
  • Loading branch information
RobbinBouwmeester authored Jan 27, 2025
2 parents b52149e + 8f8f2c7 commit 24e7ff1
Show file tree
Hide file tree
Showing 40 changed files with 1,577 additions and 367 deletions.
63 changes: 57 additions & 6 deletions proteobench/github/gh.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,31 +78,82 @@ def clone(remote_url: str, clone_dir: str) -> Repo:
try:
repo = Repo(clone_dir)
except (exc.NoSuchPathError, exc.InvalidGitRepositoryError):
repo = Repo.clone_from(remote_url.rstrip("/"), clone_dir)
repo = Repo.clone_from(remote_url.rstrip("/"), clone_dir, depth=1, no_single_branch=True)
return repo

@staticmethod
def shallow_clone(remote_url: str, clone_dir: str) -> Repo:
    """
    Return a repository object for ``clone_dir``, shallow-cloning it if needed.

    If ``clone_dir`` already exists and is a valid git repository, it is reused
    as-is (no fetch or pull is performed). Otherwise the remote is cloned with
    ``depth=1`` and ``no_single_branch=True``.

    Args:
        remote_url (str): The repository URL. Trailing slashes are stripped.
        clone_dir (str): The target directory for cloning.

    Returns:
        Repo: The existing or freshly cloned repository object.

    Raises:
        RuntimeError: If the git clone command fails. The original
            ``GitCommandError`` is attached as ``__cause__``.
    """
    if os.path.exists(clone_dir):
        print(f"Repository already exists in {clone_dir}. Trying to use existing files.")
        try:
            return Repo(clone_dir)
        except exc.InvalidGitRepositoryError:
            # Fall through to a fresh clone.
            # NOTE(review): cloning into an existing non-empty directory is
            # presumably rejected by git and will surface as RuntimeError below.
            print("Repository invalid, will clone again.")

    try:
        return Repo.clone_from(remote_url.rstrip("/"), clone_dir, depth=1, no_single_branch=True)
    except exc.GitCommandError as e:
        # Chain explicitly so the underlying git failure stays visible in tracebacks.
        raise RuntimeError(f"Failed to clone the repository: {e}") from e

def clone_repo_anonymous(self) -> Repo:
    """
    Clones the Proteobench repository anonymously with a shallow clone (without authentication).

    The repository object is also stored on ``self.repo``.

    Returns:
        Repo: The repository object returned by ``shallow_clone``.
    """
    remote_url = self.get_remote_url_anon()
    self.repo = self.shallow_clone(remote_url, self.clone_dir)
    return self.repo

def read_results_json_repo(self) -> pd.DataFrame:
def read_results_json_repo_single_file(self) -> pd.DataFrame:
    """
    Load the single `results.json` file found at the top of the clone directory.

    Returns:
        pd.DataFrame: The content of `results.json` as parsed by `pd.read_json`.

    Raises:
        FileNotFoundError: If `results.json` is not present in `self.clone_dir`.
    """
    results_path = os.path.join(self.clone_dir, "results.json")

    if os.path.exists(results_path):
        return pd.read_json(results_path)

    raise FileNotFoundError(f"File '{results_path}' does not exist.")

def read_results_json_repo(self) -> pd.DataFrame:
    """
    Reads all JSON result files from the cloned Proteobench repository.

    Every ``*.json`` file directly inside ``self.clone_dir`` except
    ``results.json`` is read as one record (a pandas Series) and the records
    are stacked into a DataFrame, one row per file. Files are processed in
    sorted name order so the row order is deterministic.

    If no such per-record file exists, the result of
    ``read_results_json_repo_single_file`` (the single ``results.json``) is
    returned instead.

    Returns:
        pd.DataFrame: A Pandas DataFrame containing aggregated results from multiple JSON files.

    Raises:
        FileNotFoundError: If the clone directory does not exist, or if the
            fallback to ``results.json`` is taken and that file is missing.
    """
    if not os.path.exists(self.clone_dir):
        raise FileNotFoundError(f"Clone directory '{self.clone_dir}' does not exist.")

    data = []
    for file in sorted(os.listdir(self.clone_dir)):
        if not file.endswith(".json") or file == "results.json":
            continue
        file_path = os.path.join(self.clone_dir, file)
        with open(file_path, "r", encoding="utf-8") as f:
            data.append(pd.read_json(f, typ="series"))

    if not data:
        # The fallback result used to be computed and then thrown away,
        # which made this method return an empty DataFrame.
        return self.read_results_json_repo_single_file()

    return pd.DataFrame(data)

def clone_repo(self) -> Repo:
"""
Clones the Proteobench repository using either an anonymous or authenticated GitHub access token.
Expand Down
122 changes: 0 additions & 122 deletions proteobench/io/params/MSAngel.py

This file was deleted.

141 changes: 53 additions & 88 deletions proteobench/io/params/__init__.py
Original file line number Diff line number Diff line change
@@ -1,95 +1,60 @@
from dataclasses import dataclass
# Reference for parameter names
# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
import json
import os
from dataclasses import dataclass, field
from typing import Optional

import numpy as np


# Reference for parameter names
# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
# NOTE(review): the class declares no dataclass fields, so the generated
# __eq__ compares empty field tuples and any two instances compare equal.
# Confirm that @dataclass is still wanted here.
@dataclass
class ProteoBenchParameters:
    """
    Parameters for a proteomics search engine.

    The attributes are not declared statically: they are created at
    construction time from a ``fields.json`` description, so which of the
    attributes listed below exist on an instance depends on the JSON file
    that was loaded. The listed types are indicative only.

    Attributes
    ----------
    software_name : Optional[str]
        Name of the software tool / pipeline used for this benchmark run
        (examples: "MaxQuant", "AlphaPept", "Proline", ...).
    software_version : Optional[str]
        Version of the software tool / pipeline used for this benchmark run
    search_engine: Optional[str]
        Search engine used for this benchmark run
        (examples: "Andromeda", "Mascot", ...).
    search_engine_version : Optional[str]
        Version of the search engine used for this benchmark run.
    ident_fdr_psm : Optional[str]
        False discovery rate (FDR) threshold for peptide-spectrum match
        (PSM) validation ("0.01" = 1%).
    ident_fdr_peptide : Optional[str]
        False discovery rate (FDR) threshold for peptide validation ("0.01" = 1%).
    ident_fdr_protein : Optional[str]
        False discovery rate (FDR) threshold for protein validation ("0.01" = 1%).
    enable_match_between_runs : Optional[bool]
        Match between run (also named cross assignment) is enabled.
    precursor_mass_tolerance : Optional[str]
        Precursor mass tolerance used for the search.
        Given as an interval of upper and lower tolerance, e.g. [-20 ppm, 20 ppm].
    fragment_mass_tolerance : Optional[str]
        Fragment mass tolerance used for the search.
        Given as an interval of upper and lower tolerance, e.g. [-0.02 Da, 0.02 Da].
    enzyme : Optional[str]
        Enzyme used as parameter for the search. If several, use "|".
    allowed_miscleavages : Optional[int]
        Maximal number of missed cleavages allowed.
    min_peptide_length : Optional[str]
        Minimum peptide length (number of residues) allowed for the search.
    max_peptide_length : Optional[str]
        Maximum peptide length (number of residues) allowed for the search.
    fixed_mods : Optional[str]
        Fixed modifications searched for in the search. If several, separate with "|".
    variable_mods : Optional[str]
        Variable modifications searched for in the search. If several, separate with "|".
    max_mods : Optional[int]
        Maximal number of modifications per peptide
        (including fixed and variable modifications).
    min_precursor_charge : Optional[int]
        Minimum precursor charge allowed.
    max_precursor_charge : Optional[int]
        Maximum precursor charge allowed.
    spectral_library_generation : Optional[dict]
        Models used to generate spectral library (DIA-specific).
    predictors_library : Optional[dict]
        Type of model used to generate spectral library.
    scan_window : Optional[int]
        Scan window radius. Ideally corresponds to approximate
        average number of data points per peak (DIA-specific).
    quantification_method_DIANN : Optional[str]
        Quantification strategy used in the DIA-NN engine (DIANN-specific).
    quantification_method : Optional[str]
        Quantification method used.
    second_pass : Optional[bool]
        Whether second pass search is enabled (DIANN-specific).
    protein_inference : Optional[str]
        Protein inference method used (e.g. Occam's razor, ProteinProphet).
    abundance_normalization_ions : Optional[str]
        Abundance normalization applied to ions (tic, median etc.).
    """

    def __init__(
        self, filename=os.path.join(os.path.dirname(__file__), "json/Quant/lfq/ion/DDA/fields.json"), **kwargs
    ):
        """
        Reads the JSON file and initializes only the attributes present in the file.

        Each top-level key of the JSON object becomes an attribute. Its value
        is the entry's ``"value"`` if present, else its ``"placeholder"``,
        else ``None``.

        Keyword arguments then override attributes that were created from the
        JSON file; keywords that do not match such an attribute are ignored.
        The string ``"None"`` is stored as ``np.nan``.

        Parameters
        ----------
        filename : str
            Path to the JSON field description. If the file does not exist,
            an error is printed and no attribute is set.
        **kwargs
            Overrides for attributes defined by the JSON file.
        """
        if not os.path.isfile(filename):
            print(f"Error: File '{filename}' not found.")
            return  # No initialization happens if the file is missing

        with open(filename, "r", encoding="utf-8") as file:
            json_dict = json.load(file)

        # Initialize only the fields present in the JSON
        for key, spec in json_dict.items():
            if not isinstance(spec, dict):
                # A malformed (non-mapping) entry still yields the attribute,
                # just without a default.
                setattr(self, key, None)
            elif "value" in spec:
                setattr(self, key, spec["value"])
            elif "placeholder" in spec:
                setattr(self, key, spec["placeholder"])
            else:
                setattr(self, key, None)

        for key, value in kwargs.items():
            if not hasattr(self, key):
                continue
            setattr(self, key, np.nan if self._is_none_marker(value) else value)

    @staticmethod
    def _is_none_marker(value):
        """Return True only for the literal string "None" (safe for array-like values)."""
        return isinstance(value, str) and value == "None"

    def __repr__(self):
        """
        Custom string representation to only show initialized attributes.

        Attributes whose value is ``None`` are left out.
        """
        return str({key: value for key, value in self.__dict__.items() if value is not None})

    def fill_none(self):
        """
        Replace every attribute holding the string "None" with np.nan.

        Attributes that are actually ``None`` are left untouched.
        """
        # Reassigning existing keys does not resize __dict__, so iterating
        # over a snapshot is only a safety measure.
        for key, value in list(self.__dict__.items()):
            if self._is_none_marker(value):
                setattr(self, key, np.nan)
# When executed as a script, build the parameters from the default fields.json and display them
if __name__ == "__main__":
    print(ProteoBenchParameters())
1 change: 1 addition & 0 deletions proteobench/io/params/alphapept.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
params.max_precursor_charge = record["features"]["iso_charge_max"]
params.enable_match_between_runs = record["workflow"]["match"] # Check if matching is enabled
params.abundance_normalization_ions = None # No normalization in AlphaPept
params.fill_none()
return params


Expand Down
2 changes: 2 additions & 0 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
params.protein_inference = f"ProteinProphet: {fragpipe_params.loc['protein-prophet.cmd-opts']}"

params.fill_none()

return params


Expand Down
2 changes: 2 additions & 0 deletions proteobench/io/params/i2masschroq.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def _extract_xtandem_params(params: pd.Series) -> ProteoBenchParameters:
min_precursor_charge=1,
max_precursor_charge=int(params.loc["spectrum, maximum parent charge"]),
)
params.fill_none()
return params


Expand Down Expand Up @@ -120,6 +121,7 @@ def _extract_sage_params(params: pd.Series) -> ProteoBenchParameters:
min_precursor_charge=int(min_precursor_charge),
max_precursor_charge=int(max_precursor_charge),
)
params.fill_none()
return params


Expand Down
Loading

0 comments on commit 24e7ff1

Please sign in to comment.