Merge pull request #6 from bigbio/dev
adding big tests for ms2rescore
ypriverol authored May 29, 2024
2 parents 75fb7cf + 30f204b commit d8da3c4
Showing 11 changed files with 127 additions and 95 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/python-app.yml
@@ -38,3 +38,14 @@ jobs:
- name: Test with pytest
run: |
pytest
- name: Download test data
run: |
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML
- name: Test percolator ms2rescore
run: |
quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05
# - name: Test mokapot ms2rescore
# run: |
# quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05 --rescoring_engine "mokapot"
#
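For reference, the new percolator test can be reproduced outside of GitHub Actions with click's test runner. The sketch below is not part of the commit; it assumes the package is installed so that the cli group in quantmsutils.quantmsutilsc is importable, and that the two test files above have already been downloaded into the working directory.

from click.testing import CliRunner

from quantmsutils.quantmsutilsc import cli

# Invoke the ms2rescore subcommand with the same arguments as the CI step.
runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "ms2rescore",
        "-p", "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML",
        "-s", "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML",
        "-n", "2",
        "-pipm", "HCD2021",
        "-fg", "ms2pip,deeplc",
        "--id_decoy_pattern", "^rev",
        "-t", "0.05",
    ],
)
# A zero exit code mirrors a green CI step; result.output carries the log.
assert result.exit_code == 0, result.output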
1 change: 1 addition & 0 deletions environment.yml
@@ -13,6 +13,7 @@ dependencies:
- pydantic
- pandas
- protobuf>=3.9.2,<4 # Ensure compatibility with tensorboard
- numpy
- pip
- pip:
- tensorboard
138 changes: 69 additions & 69 deletions quantmsutils/diann/diann2mztab.py
@@ -44,14 +44,14 @@
@click.option("--qvalue_threshold", "-q", type=float)
@click.pass_context
def diann2mztab(
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
):
"""
Convert DIA-NN output to MSstats, Triqler or mzTab.
@@ -61,7 +61,7 @@ def diann2mztab(
:param folder: DiannConvert specifies the folder where the required file resides. The folder contains
the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence
FASTA file, version file of DiaNN and ms_info TSVs
?:param exp_design: Experimental design file
:param exp_design: Experimental design file
:param dia_params: A list contains DIA parameters
:type dia_params: list
:param diann_version: Path to a version file of DIA-NN
@@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
)

s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
s_header = data[empty_row + 1].replace("\n", "").split("\t")
s_data_frame = pd.DataFrame(s_table, columns=s_header)

@@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
if aa in aa_mass and not_mod:
aa = aa_mass[aa]
elif (
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
):
logger.info(f"Unknown amino acid with mass not known:{aa}")
peptide_parts.append(aa)
@@ -367,13 +367,13 @@ def validate_diann_version(self) -> None:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

def convert_to_mztab(
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: os.PathLike,
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: Union[os.PathLike, str],
) -> None:
logger.info("Converting to mzTab")
self.validate_diann_version()
@@ -481,8 +481,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
}
mass_vector = report["Modified.Sequence"].map(uniq_masses)
report["Calculate.Precursor.Mz"] = (
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]

logger.debug("Indexing Precursors")
# Making the map is 1500x faster
@@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
"fragment_mass_tolerance:" + fragment_mass_tolerance
"fragment_mass_tolerance:" + fragment_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
"precursor_mass_tolerance:" + precursor_mass_tolerance
"precursor_mass_tolerance:" + precursor_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
@@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
missed_cleavages
)
out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
"fixed_modifications:" + fixed_modifications
"fixed_modifications:" + fixed_modifications
)
out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
"variable_modifications:" + variable_modifications
"variable_modifications:" + variable_modifications
)

(fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
@@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
i - 1
][1]
][1]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
else:
out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
@@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1000584, mzML file, ]"
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
@@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1002038, unlabeled sample, ]"
)
out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
"ms_run[" + str(i) + "]"
"ms_run[" + str(i) + "]"
)

with warnings.catch_warnings():
@@ -723,9 +723,9 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
col = {}
for i in file:
col[i] = (
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
)

pg.rename(columns=col, inplace=True)
@@ -767,7 +767,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):

protein_details_df = out_mztab_prh[
out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
]
]
prh_series = (
protein_details_df["Protein.Ids"]
.str.split(";", expand=True)
@@ -884,11 +884,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):


def mztab_peh(
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
) -> pd.DataFrame:
"""
Construct PEH sub-table.
@@ -1106,8 +1106,8 @@ def __find_info(directory, n):
# Standardize spectrum identifier format for bruker data
if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
:, "opt_global_spectrum_reference"
].astype(str)
:, "opt_global_spectrum_reference"
].astype(str)

# TODO seconds returned from precursor.getRT()
target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
@@ -1184,7 +1184,7 @@ def __find_info(directory, n):

out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
lambda x: "ms_run[{}]:".format(x["ms_run"])
+ x["opt_global_spectrum_reference"],
+ x["opt_global_spectrum_reference"],
axis=1,
result_type="expand",
)
@@ -1556,8 +1556,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
for start, length in sorted(zip(starts, lengths)):
if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
merged_lengths[-1] = (
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
)
else:
merged_starts.append(start)
@@ -1569,7 +1569,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):


def calculate_protein_coverages(
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
) -> List[str]:
"""Calculates protein coverages for the PRH table.
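The calculate_coverage hunk above merges overlapping peptide intervals before summing their lengths, so residues covered by several peptides are only counted once. The stand-alone sketch below is not from the commit and uses str.find to locate each peptide (assuming a single occurrence per peptide is enough for illustration); it only demonstrates the interval-merging idea.

from typing import Set

def coverage_sketch(ref_sequence: str, sequences: Set[str]) -> float:
    # Turn each matched peptide into a (start, length) interval on the protein.
    starts, lengths = [], []
    for seq in sequences:
        pos = ref_sequence.find(seq)
        if pos >= 0:
            starts.append(pos)
            lengths.append(len(seq))

    # Merge overlapping or adjacent intervals, as in the hunk above.
    merged_starts, merged_lengths = [], []
    for start, length in sorted(zip(starts, lengths)):
        if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
            merged_lengths[-1] = (
                max(merged_starts[-1] + merged_lengths[-1], start + length)
                - merged_starts[-1]
            )
        else:
            merged_starts.append(start)
            merged_lengths.append(length)

    # Coverage is the merged residue count over the protein length.
    return sum(merged_lengths) / len(ref_sequence)

print(coverage_sketch("MKWVTFISLLLLFSSAYS", {"MKWVT", "WVTFI", "SSAYS"}))  # ~0.67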
9 changes: 8 additions & 1 deletion quantmsutils/diann/dianncfg.py
@@ -52,6 +52,7 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List
if fix_mod != "":
for mod in fix_mod.split(","):
tag = 0
diann_mod = None
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = (
@@ -80,12 +81,18 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List
or "mTRAQ" in diann_mod
):
fix_ptm.append(diann_mod + "," + site + "," + "label")
else:
elif diann_mod is not None:
fix_ptm.append(diann_mod + "," + site)
else:
print(
"Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: "
+ mod
)

if var_mod != "":
for mod in var_mod.split(","):
tag = 0
diann_mod = None
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = (
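The dianncfg change above initializes diann_mod to None on every iteration and only appends a converted modification when a matching UniMod entry was found; anything else is reported and skipped instead of triggering an unbound-variable error later. A minimal stand-alone sketch of that guard pattern (hypothetical names, not the real convert_mod signature):

def convert_mods_sketch(unimod_names, requested_mods):
    converted = []
    for mod in requested_mods.split(","):
        diann_mod = None  # reset for every requested modification
        name, site = mod.split(" ")[0], mod.split(" ")[-1]
        if name in unimod_names:
            diann_mod = f"{name} ({site})"
        if diann_mod is not None:
            converted.append(diann_mod)
        else:
            # Mirrors the warning added in this commit for unsupported mods.
            print(
                "Warning: Currently only supported unimod modifications "
                f"for DIA pipeline. Skipped: {mod}"
            )
    return converted

print(convert_mods_sketch({"Carbamidomethyl", "Oxidation"},
                          "Carbamidomethyl C,Oxidation M,MyCustomMod K"))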
1 change: 1 addition & 0 deletions quantmsutils/mzml/mzml_statistics.py
@@ -22,6 +22,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
# Command line usage example
python script_name.py mzml_statistics --ms_path "path/to/file.mzML"
:param ctx: Click context
:param ms_path: A string specifying the path to the mass spectrometry file.
:param id_only: A boolean flag that, when set to True, generates a CSV file containing only the spectrum ID and
peaks data for MS level 2 spectra.
8 changes: 7 additions & 1 deletion quantmsutils/quantmsutilsc.py
@@ -31,5 +31,11 @@ def cli():

cli.add_command(convert_psm)

def main():
try:
cli()
except SystemExit as e:
if e.code != 0:
raise
if __name__ == "__main__":
cli()
main()
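The new main() wrapper accounts for the fact that click commands finish by raising SystemExit even on success, so only non-zero exit codes are re-raised as real failures. A small illustration of that behavior (a toy command group, not the actual quantmsutilsc CLI):

import click

@click.group()
def cli():
    """Toy stand-in for the quantmsutilsc command group."""

def main():
    try:
        cli(["--help"])  # click raises SystemExit once the command finishes
    except SystemExit as e:
        if e.code != 0:  # swallow the expected zero exit, re-raise failures
            raise

main()  # prints the group help and returns instead of exiting the interpreter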