quantms runs #550

Draft · wants to merge 9 commits into base: main
174 changes: 174 additions & 0 deletions proteobench/io/params/quantms.py
@@ -0,0 +1,174 @@
"""quantms is a nextflow pipeline that execution depends on the settings in an
SDRF file. It is executed using a parameters file in JSON format.

However, the version of packages are dumped to a versions yaml file. And some parameters
are taken from the SDRF file.
"""

import json
import logging
import pathlib
from typing import IO, Tuple, Union

import pandas as pd
import yaml

from proteobench.io.params import ProteoBenchParameters

logger = logging.getLogger(__name__)


def load_versions(file: IO) -> dict:
    """Load the versions of the tools used in the quantms pipeline."""
    versions = yaml.safe_load(file)
    return versions


def load_parsed_sdrf(file: Union[str, pathlib.Path, IO]) -> pd.DataFrame:
    """Load the parsed SDRF file."""
    return pd.read_csv(file, sep="\t")


def load_files(file1: IO, file2: IO, file3: IO) -> Tuple[dict, pd.DataFrame, dict]:
    """Load the three files independently of the order in which they are provided."""
    versions = None
    sdrf = None
    pipeline_params = None
    for file in [file1, file2, file3]:
        # First, try to read the file as the versions YAML.
        try:
            _versions = load_versions(file)
            if "Workflow" not in _versions:
                logger.debug("Loaded some other file as YAML.")
                file.seek(0)
            elif versions is None:
                versions = _versions
                continue
            elif "custom_config_base" in _versions:
                # The nextflow parameters file also parses as YAML; rewind so
                # it can be read as JSON below.
                logger.debug("Loaded nextflow parameters file.")
                file.seek(0)
            else:
                raise ValueError("Multiple version files provided.")
        except yaml.YAMLError:
            file.seek(0)

        # Second, try to read the file as the nextflow parameters JSON.
        try:
            _pipeline_params = json.load(file)
            if pipeline_params is None:
                pipeline_params = _pipeline_params
                continue
            else:
                raise ValueError("Multiple parameter files provided.")
        except json.JSONDecodeError as e:
            logger.debug(e)
            file.seek(0)

        # Finally, try to read the file as the parsed SDRF (a TSV file).
        try:
            _sdrf = load_parsed_sdrf(file)
            if _sdrf.shape[1] == 1:
                logger.debug("Loaded a version or parameter file. Skipping.")
                continue
            elif sdrf is None:
                sdrf = _sdrf
            else:
                raise ValueError("Multiple SDRF files provided.")
        except pd.errors.EmptyDataError:
            pass

    assert versions is not None, "No versions file provided."
    assert sdrf is not None, "No SDRF file provided."
    assert pipeline_params is not None, "No parameters file provided."

    return versions, sdrf, pipeline_params
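
# Example usage (hypothetical file names; any upload order works):
#     with open("versions.yml") as f1, open("params.json") as f2, open("sdrf.tsv") as f3:
#         versions, sdrf, pipeline_params = load_files(f1, f2, f3)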


def extract_params(file1: IO, file2: IO, file3: IO) -> Tuple[dict, pd.DataFrame, dict, ProteoBenchParameters]:
    """
    Extract parameters from the parsed SDRF and the versions file. Both the parsed
    SDRF file and the versions YAML file are used to extract the parameters. The
    function needs to handle the files in any order, as the Streamlit interface
    allows the user to upload them in any order.

    This might change in a newer quantms version with one central parameters
    file.
    """
    versions, sdrf, pipeline_params = load_files(file1, file2, file3)

    params = ProteoBenchParameters()
    params.software_name = "quantms"
    params.software_version = versions["Workflow"]["bigbio/quantms"]
    engines = list()
    engines_version = list()
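    # NOTE: keys of the form "SEARCHENGINE<NAME>" (e.g. "SEARCHENGINECOMET")
    # are assumed to hold the per-engine version dicts; this naming is inferred
    # from the test files rather than a documented quantms contract.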
    for key in versions:
        if key.startswith("SEARCHENGINE"):
            _engine = key.split("SEARCHENGINE")[-1].lower()
            engines.append(_engine)
            if _engine == "comet":
                engines_version.append(versions[key]["Comet"])
            elif _engine == "msgf":
                engines_version.append(versions[key]["msgf_plus"])
            else:
                raise ValueError(f"Unknown search engine: {_engine}")
    if engines:
        params.search_engine = ",".join(engines)
    if engines_version:
        params.search_engine_version = ",".join(engines_version)

    # "fdr_level": "psm_level_fdrs",
    params.ident_fdr_psm = pipeline_params["psm_level_fdr_cutoff"]
    params.ident_fdr_protein = pipeline_params["protein_level_fdr_cutoff"]
    params.variable_mods = pipeline_params["variable_mods"]
    params.fixed_mods = pipeline_params["fixed_mods"]
    params.max_mods = pipeline_params["max_mods"]
    params.min_precursor_charge = pipeline_params["min_precursor_charge"]
    params.max_precursor_charge = pipeline_params["max_precursor_charge"]
    params.max_peptide_length = pipeline_params["max_peptide_length"]
    params.min_peptide_length = pipeline_params["min_peptide_length"]
    params.precursor_mass_tolerance = pipeline_params["precursor_mass_tolerance"]
    params.fragment_mass_tolerance = pipeline_params["fragment_mass_tolerance"]
    params.allowed_miscleavages = pipeline_params["allowed_missed_cleavages"]
    params.quantification_method = pipeline_params["quantification_method"]
    params.protein_inference = pipeline_params["protein_inference_method"]

    # These might (also) be available in the SDRF information:
    # params.quantification_method =
    # params.protein_inference =
    # params.abundance_normalization_ions =

    return (versions, sdrf, pipeline_params, params)


if __name__ == "__main__":
    from pathlib import Path

    fpath1 = Path("../../../test/params/quantms_1-3.sdrf_config.tsv")
    fpath2 = Path("../../../test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml")
    fpath3 = Path("../../../test/params/quantms_1-3_dev.json")

    # Extract parameters from the files
    with open(fpath1, "r") as file1, open(fpath2, "r") as file2, open(fpath3, "r") as file3:
        versions, sdrf, pipeline_params, params = extract_params(file1, file2, file3)
    print(params.__dict__)

    import itertools

    # The extraction must be independent of the order of the uploaded files.
    permutations_fpath = list(itertools.permutations([fpath1, fpath2, fpath3]))
    for file1, file2, file3 in permutations_fpath:
        print(file1.name, file2.name, file3.name)
        with open(file1, "r") as f1, open(file2, "r") as f2, open(file3, "r") as f3:
            _versions, _sdrf, _pipeline_params, params = extract_params(f1, f2, f3)
        assert _versions == versions
        assert _sdrf.equals(sdrf)
        assert _pipeline_params == pipeline_params
        # print(params.__dict__)

    # Convert the extracted parameters to a dictionary and then to a pandas Series
    # data_dict = params.__dict__
    # series = pd.Series(data_dict)
    # # Write the Series to a CSV file
    # series.to_csv(fpath1.with_suffix(".csv"))
@@ -0,0 +1,33 @@
[mapper]
"ProteinName" = "Proteins"
"Sequence" = "Sequence"
"PrecursorCharge" = "Charge"
"Reference" = "Raw file"
"PeptideSequence" = "Modified sequence"

[condition_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "A"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "B"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "B"

[run_mapper]
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "Condition_A_Sample_Alpha_01"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "Condition_A_Sample_Alpha_02"
"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "Condition_A_Sample_Alpha_03"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "Condition_B_Sample_Alpha_01"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "Condition_B_Sample_Alpha_02"
"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "Condition_B_Sample_Alpha_03"


[species_mapper]
"_YEAST" = "YEAST"
"_ECOLI" = "ECOLI"
"_HUMAN" = "HUMAN"


[general]
"contaminant_flag" = "Cont_"
"decoy_flag" = false
@@ -7,6 +7,7 @@
"MSAngel" = "parse_settings_msangel.toml"
"Sage" = "parse_settings_sage.toml"
"PEAKS" = "parse_settings_peaks.toml"
"quantms" = "parse_settings_msstats.toml"
"Custom" = "parse_settings_custom.toml"

[quant_lfq_peptidoform_DDA]
14 changes: 12 additions & 2 deletions proteobench/io/parsing/parse_ion.py
@@ -1,7 +1,6 @@
import math
import os
import re
from typing import Dict, List, Optional

import pandas as pd

@@ -109,7 +108,18 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
elif input_format == "PEAKS":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")

elif input_format == "quantms":
input_data_frame = pd.read_csv(input_csv, low_memory=False)
input_data_frame = input_data_frame.assign(
proforma=input_data_frame["PeptideSequence"].str.replace(
r"\(([^)]+)\)",
r"",
regex=True,
),
)
input_data_frame["Sequence"] = input_data_frame["PeptideSequence"].str.replace(r"\(([^)]+)\)", r"", regex=True)
else:
raise ValueError(f"Input format '{input_format}' not recognized.")
return input_data_frame
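
# A toy illustration of the stripping regex used above (illustrative input,
# not taken from a real quantms run):
#     >>> import re
#     >>> re.sub(r"\(([^)]+)\)", "", "PEPT(Oxidation)IDEK")
#     'PEPTIDEK'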


10 changes: 6 additions & 4 deletions proteobench/io/parsing/parse_settings.py
@@ -171,14 +171,16 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di
df_filtered_melted["proforma"] + "|Z=" + df_filtered_melted["Charge"].astype(str)
)
else:
print("Not all columns required for making the ion are available.")
# ! raise ValueError
print("Not all columns required for making the ion are available: 'proforma' and 'Charge'.")
return df_filtered_melted, replicate_to_raw

elif self.analysis_level == "peptidoform":
if "proforma" in df_filtered_melted.columns:
df_filtered_melted["peptidoform"] = df_filtered_melted["proforma"]
else:
print("Not all columns required for making the peptidoform are available.")
# ! raise ValueError
print("Not all columns required for making the peptidoform are available: 'proforma'.")
return df_filtered_melted, replicate_to_raw

else:
@@ -244,11 +246,11 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di
if self.parser.analysis_level == "ion":
try:
df["precursor ion"] = df["proforma"] + "|Z=" + df["Charge"].astype(str)
except KeyError:
except KeyError as e:
raise KeyError(
"Not all columns required for making the ion are available."
" Is the charge available in the input file?"
)
) from e

return df, replicate_to_raw

23 changes: 13 additions & 10 deletions proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py
@@ -113,42 +113,45 @@ def benchmarking(
except pd.errors.ParserError as e:
raise ParseError(
f"Error parsing {input_format} file, please make sure the format is correct and the correct software tool is chosen: {e}"
)
) from e
except Exception as e:
raise ParseSettingsError(f"Error parsing the input file: {e}")
raise ParseSettingsError("Error parsing the input file.") from e

msg = f"Folder: {self.parse_settings_dir}, Module: {self.module_id}"
# Parse settings file
try:
parse_settings = ParseSettingsBuilder(
parse_settings_dir=self.parse_settings_dir, module_id=self.module_id
).build_parser(input_format)
except KeyError as e:
raise ParseSettingsError(f"Error parsing settings file for parsing, settings seem to be missing: {e}")
raise ParseSettingsError(
f"Error parsing settings file for parsing, settings seem to be missing: {msg}"
) from e
except FileNotFoundError as e:
raise ParseSettingsError(f"Could not find the parsing settings file: {e}")
raise ParseSettingsError(f"Could not find the parsing settings file: {msg}") from e
except Exception as e:
raise ParseSettingsError(f"Error parsing settings file for parsing: {e}")
raise ParseSettingsError(f"Error parsing settings file for parsing: {msg}") from e

try:
standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)
except KeyError as e:
raise ConvertStandardFormatError(f"Error converting to standard format, key missing: {e}")
raise ConvertStandardFormatError("Error converting to standard format, key missing.") from e
except Exception as e:
raise ConvertStandardFormatError(f"Error converting to standard format: {e}")
raise ConvertStandardFormatError("Error converting to standard format.") from e

# calculate quantification scores
# instantiate quantification scores
try:
quant_score = QuantScores(
self.precursor_name, parse_settings.species_expected_ratio(), parse_settings.species_dict()
)
except Exception as e:
raise QuantificationError(f"Error generating quantification scores: {e}")
raise QuantificationError("Error generating quantification scores.") from e

# generate intermediate data structure
try:
intermediate_data_structure = quant_score.generate_intermediate(standard_format, replicate_to_raw)
except Exception as e:
raise IntermediateFormatGenerationError(f"Error generating intermediate data structure: {e}")
raise IntermediateFormatGenerationError("Error generating intermediate data structure.") from e

# try:
current_datapoint = Datapoint.generate_datapoint(
1 change: 1 addition & 0 deletions proteobench/plotting/plot_quant.py
@@ -89,6 +89,7 @@ def plot_metric(
"MSAID": "#afff57",
"Proteome Discoverer": "#8c564b",
"PEAKS": "#f781bf",
"quantms": "#03fc39",
},
mapping: Dict[str, int] = {"old": 10, "new": 20},
highlight_color: str = "#d30067",