quantms runs #550

Draft · wants to merge 9 commits into base: main
174 changes: 174 additions & 0 deletions proteobench/io/params/quantms.py
@@ -0,0 +1,174 @@
"""quantms is a nextflow pipeline that execution depends on the settings in an
SDRF file. It is executed using a parameters file in JSON format.

However, the version of packages are dumped to a versions yaml file. And some parameters
are taken from the SDRF file.
"""

import json
import logging
import pathlib
from typing import IO, Tuple, Union

import pandas as pd
import yaml

from proteobench.io.params import ProteoBenchParameters

logger = logging.getLogger(__name__)


def load_versions(file: IO) -> dict:
    """Load the versions of the tools used in the quantms pipeline."""
    versions = yaml.safe_load(file)
    return versions


def load_parsed_sdrf(file: Union[str, pathlib.Path, IO]) -> pd.DataFrame:
    """Load the parsed SDRF file."""
    return pd.read_csv(file, sep="\t")


def load_files(file1: IO, file2: IO, file3: IO) -> Tuple[dict, pd.DataFrame, dict]:
    """Load the three files independently of the order in which they are provided."""
    versions = None
    sdrf = None
    pipeline_params = None
    for file in [file1, file2, file3]:
        # First, try to read the file as the versions YAML.
        try:
            _versions = load_versions(file)
            if "Workflow" not in _versions:
                logger.debug("Loaded some other file as YAML.")
                file.seek(0)
            elif versions is None:
                versions = _versions
                continue
            elif "custom_config_base" in _versions:
                # The nextflow parameters file also parses as YAML; rewind so
                # it can be read as JSON below.
                logger.debug("Loaded nextflow parameters file.")
                file.seek(0)
            else:
                raise ValueError("Multiple version files provided.")
        except yaml.YAMLError:
            file.seek(0)

        # Second, try to read the file as the nextflow parameters JSON.
        try:
            _pipeline_params = json.load(file)
            if pipeline_params is None:
                pipeline_params = _pipeline_params
                continue
            else:
                raise ValueError("Multiple parameter files provided.")
        except json.JSONDecodeError as e:
            logger.debug(e)
            file.seek(0)

        # Finally, try to read the file as the parsed SDRF (a TSV file).
        try:
            _sdrf = load_parsed_sdrf(file)
            if _sdrf.shape[1] == 1:
                logger.debug("Loaded a version or parameter file. Skipping.")
                continue
            elif sdrf is None:
                sdrf = _sdrf
            else:
                raise ValueError("Multiple SDRF files provided.")
        except pd.errors.EmptyDataError:
            pass

    assert versions is not None, "No versions file provided."
    assert sdrf is not None, "No SDRF file provided."
    assert pipeline_params is not None, "No parameters file provided."

    return versions, sdrf, pipeline_params
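
# Example usage (hypothetical file names; any upload order works):
#     with open("versions.yml") as f1, open("params.json") as f2, open("sdrf.tsv") as f3:
#         versions, sdrf, pipeline_params = load_files(f1, f2, f3)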


def extract_params(file1: IO, file2: IO, file3: IO) -> Tuple[dict, pd.DataFrame, dict, ProteoBenchParameters]:
    """
    Extract parameters from the parsed SDRF and the versions file. Both the parsed
    SDRF file and the versions YAML file are used to extract the parameters. The
    function needs to handle the files in any order, as the Streamlit interface
    allows the user to upload them in any order.

    This might change in a newer quantms version with one central parameters
    file.
    """
    versions, sdrf, pipeline_params = load_files(file1, file2, file3)

    params = ProteoBenchParameters()
    params.software_name = "quantms"
    params.software_version = versions["Workflow"]["bigbio/quantms"]
    engines = list()
    engines_version = list()
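    # NOTE: keys of the form "SEARCHENGINE<NAME>" (e.g. "SEARCHENGINECOMET")
    # are assumed to hold the per-engine version dicts; this naming is inferred
    # from the test files rather than a documented quantms contract.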
    for key in versions:
        if key.startswith("SEARCHENGINE"):
            _engine = key.split("SEARCHENGINE")[-1].lower()
            engines.append(_engine)
            if _engine == "comet":
                engines_version.append(versions[key]["Comet"])
            elif _engine == "msgf":
                engines_version.append(versions[key]["msgf_plus"])
            else:
                raise ValueError(f"Unknown search engine: {_engine}")
    if engines:
        params.search_engine = ",".join(engines)
    if engines_version:
        params.search_engine_version = ",".join(engines_version)

    # "fdr_level": "psm_level_fdrs",
    params.ident_fdr_psm = pipeline_params["psm_level_fdr_cutoff"]
    params.ident_fdr_protein = pipeline_params["protein_level_fdr_cutoff"]
    params.variable_mods = pipeline_params["variable_mods"]
    params.fixed_mods = pipeline_params["fixed_mods"]
    params.max_mods = pipeline_params["max_mods"]
    params.min_precursor_charge = pipeline_params["min_precursor_charge"]
    params.max_precursor_charge = pipeline_params["max_precursor_charge"]
    params.max_peptide_length = pipeline_params["max_peptide_length"]
    params.min_peptide_length = pipeline_params["min_peptide_length"]
    params.precursor_mass_tolerance = pipeline_params["precursor_mass_tolerance"]
    params.fragment_mass_tolerance = pipeline_params["fragment_mass_tolerance"]
    params.allowed_miscleavages = pipeline_params["allowed_missed_cleavages"]
    params.quantification_method = pipeline_params["quantification_method"]
    params.protein_inference = pipeline_params["protein_inference_method"]

    # These might (also) be available in the SDRF information:
    # params.quantification_method =
    # params.protein_inference =
    # params.abundance_normalization_ions =

    return (versions, sdrf, pipeline_params, params)


if __name__ == "__main__":
    from pathlib import Path

    fpath1 = Path("../../../test/params/quantms_1-3.sdrf_config.tsv")
    fpath2 = Path("../../../test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml")
    fpath3 = Path("../../../test/params/quantms_1-3_dev.json")

    # Extract parameters from the files
    with open(fpath1, "r") as file1, open(fpath2, "r") as file2, open(fpath3, "r") as file3:
        versions, sdrf, pipeline_params, params = extract_params(file1, file2, file3)
    print(params.__dict__)

    import itertools

    # The extraction must be independent of the order of the uploaded files.
    permutations_fpath = list(itertools.permutations([fpath1, fpath2, fpath3]))
    for file1, file2, file3 in permutations_fpath:
        print(file1.name, file2.name, file3.name)
        with open(file1, "r") as f1, open(file2, "r") as f2, open(file3, "r") as f3:
            _versions, _sdrf, _pipeline_params, params = extract_params(f1, f2, f3)
        assert _versions == versions
        assert _sdrf.equals(sdrf)
        assert _pipeline_params == pipeline_params
        # print(params.__dict__)

    # Convert the extracted parameters to a dictionary and then to a pandas Series
    # data_dict = params.__dict__
    # series = pd.Series(data_dict)
    # # Write the Series to a CSV file
    # series.to_csv(fpath1.with_suffix(".csv"))
@@ -0,0 +1,33 @@
[mapper]
"ProteinName" = "Proteins"
"Sequence" = "Sequence"
"PrecursorCharge" = "Charge"
"Reference" = "Raw file"
"PeptideSequence" = "Modified sequence"

[condition_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "B"

[run_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "Condition_A_Sample_Alpha_01"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "Condition_A_Sample_Alpha_02"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "Condition_A_Sample_Alpha_03"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "Condition_B_Sample_Alpha_01"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "Condition_B_Sample_Alpha_02"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "Condition_B_Sample_Alpha_03"


[species_mapper]
"_YEAST" = "YEAST"
"_ECOLI" = "ECOLI"
"_HUMAN" = "HUMAN"


[general]
"contaminant_flag" = "Cont_"
"decoy_flag" = false
@@ -7,6 +7,7 @@
"MSAngel" = "parse_settings_msangel.toml"
"Sage" = "parse_settings_sage.toml"
"PEAKS" = "parse_settings_peaks.toml"
"quantms" = "parse_settings_msstats.toml"
"Custom" = "parse_settings_custom.toml"

[quant_lfq_peptidoform_DDA]
14 changes: 12 additions & 2 deletions proteobench/io/parsing/parse_ion.py
@@ -1,7 +1,6 @@
import math
import os
import re
from typing import Dict, List, Optional

import pandas as pd

@@ -109,7 +108,18 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
elif input_format == "PEAKS":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")

elif input_format == "quantms":
input_data_frame = pd.read_csv(input_csv, low_memory=False)
input_data_frame = input_data_frame.assign(
proforma=input_data_frame["PeptideSequence"].str.replace(
r"\(([^)]+)\)",
r"",
regex=True,
),
)
input_data_frame["Sequence"] = input_data_frame["PeptideSequence"].str.replace(r"\(([^)]+)\)", r"", regex=True)
else:
raise ValueError(f"Input format '{input_format}' not recognized.")
return input_data_frame
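
# A toy illustration of the stripping regex used above (illustrative input,
# not taken from a real quantms run):
#     >>> import re
#     >>> re.sub(r"\(([^)]+)\)", "", "PEPT(Oxidation)IDEK")
#     'PEPTIDEK'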


10 changes: 6 additions & 4 deletions proteobench/io/parsing/parse_settings.py
@@ -171,14 +171,16 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di
df_filtered_melted["proforma"] + "|Z=" + df_filtered_melted["Charge"].astype(str)
)
else:
print("Not all columns required for making the ion are available.")
# ! raise ValueError
print("Not all columns required for making the ion are available: 'proforma' and 'Charge'.")
return df_filtered_melted, replicate_to_raw

elif self.analysis_level == "peptidoform":
if "proforma" in df_filtered_melted.columns:
df_filtered_melted["peptidoform"] = df_filtered_melted["proforma"]
else:
print("Not all columns required for making the peptidoform are available.")
# ! raise ValueError
print("Not all columns required for making the peptidoform are available: 'proforma'.")
return df_filtered_melted, replicate_to_raw

else:
@@ -244,11 +246,11 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di
if self.parser.analysis_level == "ion":
try:
df["precursor ion"] = df["proforma"] + "|Z=" + df["Charge"].astype(str)
except KeyError:
except KeyError as e:
raise KeyError(
"Not all columns required for making the ion are available."
" Is the charge available in the input file?"
)
) from e

return df, replicate_to_raw

23 changes: 13 additions & 10 deletions proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py
@@ -113,42 +113,45 @@ def benchmarking(
except pd.errors.ParserError as e:
raise ParseError(
f"Error parsing {input_format} file, please make sure the format is correct and the correct software tool is chosen: {e}"
)
) from e
except Exception as e:
raise ParseSettingsError(f"Error parsing the input file: {e}")
raise ParseSettingsError("Error parsing the input file.") from e

msg = f"Folder: {self.parse_settings_dir}, Module: {self.module_id}"
# Parse settings file
try:
parse_settings = ParseSettingsBuilder(
parse_settings_dir=self.parse_settings_dir, module_id=self.module_id
).build_parser(input_format)
except KeyError as e:
raise ParseSettingsError(f"Error parsing settings file for parsing, settings seem to be missing: {e}")
raise ParseSettingsError(
f"Error parsing settings file for parsing, settings seem to be missing: {msg}"
) from e
except FileNotFoundError as e:
raise ParseSettingsError(f"Could not find the parsing settings file: {e}")
raise ParseSettingsError(f"Could not find the parsing settings file: {msg}") from e
except Exception as e:
raise ParseSettingsError(f"Error parsing settings file for parsing: {e}")
raise ParseSettingsError(f"Error parsing settings file for parsing: {msg}") from e

try:
standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)
except KeyError as e:
raise ConvertStandardFormatError(f"Error converting to standard format, key missing: {e}")
raise ConvertStandardFormatError("Error converting to standard format, key missing.") from e
except Exception as e:
raise ConvertStandardFormatError(f"Error converting to standard format: {e}")
raise ConvertStandardFormatError("Error converting to standard format.") from e

# calculate quantification scores
# instantiate quantification scores
try:
quant_score = QuantScores(
self.precursor_name, parse_settings.species_expected_ratio(), parse_settings.species_dict()
)
except Exception as e:
raise QuantificationError(f"Error generating quantification scores: {e}")
raise QuantificationError("Error generating quantification scores.") from e

# generate intermediate data structure
try:
intermediate_data_structure = quant_score.generate_intermediate(standard_format, replicate_to_raw)
except Exception as e:
raise IntermediateFormatGenerationError(f"Error generating intermediate data structure: {e}")
raise IntermediateFormatGenerationError("Error generating intermediate data structure.") from e

# try:
current_datapoint = Datapoint.generate_datapoint(
1 change: 1 addition & 0 deletions proteobench/plotting/plot_quant.py
@@ -89,6 +89,7 @@ def plot_metric(
"MSAID": "#afff57",
"Proteome Discoverer": "#8c564b",
"PEAKS": "#f781bf",
"quantms": "#03fc39",
},
mapping: Dict[str, int] = {"old": 10, "new": 20},
highlight_color: str = "#d30067",