Merge pull request #10 from bigbio/dev

boad added
bigbio · Jul 30, 2024 · 332589e · 332589e
2 parents bef95d5 + 066773c
commit 332589e
Show file tree

Hide file tree

Showing 9 changed files with 49 additions and 36 deletions.
diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml
@@ -47,4 +47,5 @@ jobs:
       run: |
         conda activate quantmsutils
         quantmsutilsc --help
+        quantmsutilsc --version
       shell: bash -l {0}
diff --git a/environment.yml b/environment.yml
@@ -6,7 +6,7 @@ channels:
   - conda-forge
 dependencies:
   - click
-  - sdrf-pipelines
+  - sdrf-pipelines>=0.0.28
   - pyopenms
   - ms2rescore=3.0.2
   - psm-utils=0.8.0

diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.3"
diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
@@ -362,7 +362,7 @@ def diann_version(self) -> str:
         return diann_version_id
 
     def validate_diann_version(self) -> None:
-        supported_diann_versions = ["1.8.1"]
+        supported_diann_versions = ["1.8.1", "1.9.beta.1"]
         if self.diann_version not in supported_diann_versions:
             raise ValueError(f"Unsupported DIANN version {self.diann_version}")
 
@@ -732,7 +732,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
 
     logger.debug("Classifying results type ...")
     pg["opt_global_result_type"] = "single_protein"
-    pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = (
+    pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = (
         "indistinguishable_protein_group"
     )
 
@@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1)
     out_mztab_prh.rename(
         columns={
-            "Protein.Group": "accession",
             "First.Protein.Description": "description",
         },
         inplace=True,
@@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
 
     logger.debug("Extracting accession values (keeping first)...")
     out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply(
-        lambda x: x["accession"].split(";")[0], axis=1
+        lambda x: x["Protein.Group"].split(";")[0], axis=1
     )
 
     protein_details_df = out_mztab_prh[
         out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
     ]
     prh_series = (
-        protein_details_df["Protein.Ids"]
+        protein_details_df["Protein.Group"]
         .str.split(";", expand=True)
         .stack()
         .reset_index(level=1, drop=True)
@@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
     out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply(
         lambda x: (
-            x["Protein.Ids"]
+            x["Protein.Group"]
             if x["opt_global_result_type"] == "indistinguishable_protein_group"
             else "null"
         ),
@@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     score_looker = ModScoreLooker(report)
     out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = (
         out_mztab_prh.apply(
-            lambda x: score_looker.get_score(x["Protein.Ids"]),
+            lambda x: score_looker.get_score(x["Protein.Group"]),
             axis=1,
             result_type="expand",
         )
@@ -833,19 +832,19 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # This used to be a bottleneck in performance
     # This implementation drops the run time from 57s to 25ms
     protein_agg_report = (
-        report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
-        .groupby(["study_variable", "Protein.Ids"])
+        report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
+        .groupby(["study_variable", "Protein.Group"])
         .agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
         .reset_index()
-        .pivot(columns=["study_variable"], index="Protein.Ids")
+        .pivot(columns=["study_variable"], index="Protein.Group")
         .reset_index()
     )
     protein_agg_report.columns = [
         "::".join([str(s) for s in col]).strip()
         for col in protein_agg_report.columns.values
     ]
     subname_mapper = {
-        "Protein.Ids::::": "Protein.Ids",
+        "Protein.Group::::": "Protein.Group",
         "PG.MaxLFQ::mean": "protein_abundance_study_variable",
         "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
         "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
@@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
     out_mztab_prh = out_mztab_prh.merge(
         protein_agg_report,
-        on="Protein.Ids",
+        on="Protein.Group",
         how="left",
         validate="many_to_one",
         copy=True,
@@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     out_mztab_prh.loc[:, "PRH"] = "PRT"
     index = out_mztab_prh.loc[:, "PRH"]
     out_mztab_prh.drop(
-        ["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True
+        ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True
     )
     out_mztab_prh.insert(0, "PRH", index)
     out_mztab_prh.fillna("null", inplace=True)
@@ -916,14 +915,14 @@ def mztab_peh(
     out_mztab_peh = pd.DataFrame()
     out_mztab_peh = pr.iloc[:, 0:10]
     out_mztab_peh.drop(
-        ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"],
+        ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],
         axis=1,
         inplace=True,
     )
     out_mztab_peh.rename(
         columns={
             "Stripped.Sequence": "sequence",
-            "Protein.Ids": "accession",
+            "Protein.Group": "accession",
             "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
             "Precursor.Charge": "charge",
         },
@@ -1123,7 +1122,7 @@ def __find_info(directory, n):
     out_mztab_psh = out_mztab_psh[
         [
             "Stripped.Sequence",
-            "Protein.Ids",
+            "Protein.Group",
             "Q.Value",
             "RT.Start",
             "Precursor.Charge",
@@ -1239,7 +1238,7 @@ def classify_result_type(target):
     :return: A string implys protein type
     :rtype: str
     """
-    if ";" in target["Protein.Ids"]:
+    if ";" in target["Protein.Group"]:
         return "indistinguishable_protein_group"
     return "single_protein"
 
@@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level):
         return tuple(q_value)
 
     if flag == 1 and level == "protein":
-        result = report[report["Protein.Ids"] == target]
+        result = report[report["Protein.Group"] == target]
         prh_params = []
         for i in range(1, max_ + 1):
             match = result[result["study_variable"] == i]
@@ -1320,9 +1319,9 @@ def __init__(self, report: pd.DataFrame) -> None:
 
     def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
         grouped_df = (
-            report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
+            report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
             .sort_values("Global.PG.Q.Value", ascending=True)
-            .groupby(["Protein.Ids"])
+            .groupby(["Protein.Group"])
             .head(1)
         )
         #        Modified.Sequence               Protein.Ids  Global.PG.Q.Value
@@ -1332,7 +1331,7 @@ def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
         # 103588      NPVGYPLAWQFLR           Q9NZ08;Q9NZ08-2           0.000252
 
         out = {
-            row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
+            row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
             for _, row in grouped_df.iterrows()
         }
         return out
@@ -1578,17 +1577,17 @@ def calculate_protein_coverages(
     protein in the PRH table (defined by accession, not protein.ids).
     """
     nested_df = (
-        report[["Protein.Ids", "Stripped.Sequence"]]
-        .groupby("Protein.Ids")
+        report[["Protein.Group", "Stripped.Sequence"]]
+        .groupby("Protein.Group")
         .agg({"Stripped.Sequence": set})
         .reset_index()
     )
     #                      Protein.Ids                                  Stripped.Sequence
     # 0     A0A024RBG1;Q9NZJ9;Q9NZJ9-2                                   {SEQEDEVLLVSSSR}
     # 1        A0A096LP49;A0A096LP49-2                                  {SPWAMTERKHSSLER}
     # 2                A0AVT1;A0AVT1-2  {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
-    ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
-    acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Ids"]))
+    ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
+    acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"]))
     fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
     acc_to_fasta_ids: dict = {}
 

diff --git a/quantmsutils/psm/psm_conversion.py b/quantmsutils/psm/psm_conversion.py
@@ -15,7 +15,7 @@
     "modifications",
     "retention_time",
     "charge",
-    "calc_mass_to_charge",
+    "exp_mass_to_charge",
     "reference_file_name",
     "scan_number",
     "peptidoform",
@@ -33,6 +33,8 @@
 
 
 def mods_position(peptide):
+    if peptide.startswith("."):
+        peptide = peptide[1:]
     pattern = re.compile(r"\((.*?)\)")
     original_mods = pattern.findall(peptide)
     peptide = re.sub(r"\(.*?\)", ".", peptide)
@@ -95,7 +97,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal
 
     for peptide_id in pep_ids:
         retention_time = peptide_id.getRT()
-        calc_mass_to_charge = peptide_id.getMZ()
+        exp_mass_to_charge = peptide_id.getMZ()
         scan_number = int(
             re.findall(
                 r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference")
@@ -153,7 +155,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal
                     modifications,
                     retention_time,
                     charge,
-                    calc_mass_to_charge,
+                    exp_mass_to_charge,
                     reference_file_name,
                     scan_number,
                     peptidoform,

diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py
@@ -8,27 +8,24 @@
 from quantmsutils.rescoring.ms2rescore import ms2rescore
 from quantmsutils.sdrf.check_samplesheet import check_samplesheet
 from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign
+from quantmsutils import __version__
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
 
+@click.version_option(version=__version__, package_name="quantmsutils", message="%(package)s %(version)s")
 @click.group(context_settings=CONTEXT_SETTINGS)
 def cli():
     pass
 
 
 cli.add_command(dianncfg)
 cli.add_command(diann2mztab)
-
 cli.add_command(add_sage_feature)
-
 cli.add_command(mzml_statistics)
-
 cli.add_command(extract_sample_from_expdesign)
 cli.add_command(check_samplesheet)
-
 cli.add_command(ms2rescore)
-
 cli.add_command(convert_psm)
 
 

diff --git a/quantmsutils/rescoring/ms2rescore.py b/quantmsutils/rescoring/ms2rescore.py
@@ -19,6 +19,7 @@
 def parse_cli_arguments_to_config(
     config_file: str = None,
     feature_generators: str = None,
+    ms2pip_model_dir: str = None,
     ms2pip_model: str = None,
     ms2_tolerance: float = None,
     calibration_set_size: float = None,
@@ -46,6 +47,7 @@ def parse_cli_arguments_to_config(
             config["ms2rescore"]["feature_generators"]["basic"] = {}
         if "ms2pip" in feature_generators_list:
             config["ms2rescore"]["feature_generators"]["ms2pip"] = {
+                "model_dir": ms2pip_model_dir,
                 "model": ms2pip_model,
                 "ms2_tolerance": ms2_tolerance,
             }
@@ -76,6 +78,8 @@ def parse_cli_arguments_to_config(
                 "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step."
             )
 
+    if ms2pip_model_dir is not None:
+        config["ms2rescore"]["ms2pip_model_dir"] = ms2pip_model_dir
     if ms2pip_model is not None:
         config["ms2rescore"]["ms2pip_model"] = ms2pip_model
     if ms2_tolerance is not None:
@@ -216,6 +220,13 @@ def filter_out_artifact_psms(
     type=str,
     default="Immuno-HCD",
 )
+@click.option(
+    "-md",
+    "--ms2pip_model_dir",
+    help="The path of MS²PIP model (default: `./`)",
+    type=str,
+    default="./"
+)
 @click.option(
     "-ms2tol",
     "--ms2_tolerance",
@@ -271,6 +282,7 @@ def ms2rescore(
     fasta_file,
     test_fdr,
     feature_generators,
+    ms2pip_model_dir,
     ms2pip_model,
     ms2_tolerance,
     calibration_set_size,
@@ -318,6 +330,7 @@ def ms2rescore(
         config_file=config_file,
         output_path=output_path,
         feature_generators=feature_generators,
+        ms2pip_model_dir=ms2pip_model_dir,
         ms2pip_model=ms2pip_model,
         processes=processes,
         ms2_tolerance=ms2_tolerance,

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 click
-sdrf-pipelines
+sdrf-pipelines==0.0.28
 pyopenms
 ms2rescore==3.0.2
 psm-utils==0.8.0

diff --git a/setup.py b/setup.py
@@ -30,7 +30,7 @@
 
 INSTALL_REQUIRES = [
     "click",
-    "sdrf-pipelines",
+    "sdrf-pipelines==0.0.28",
     "pyopenms",
     "ms2rescore==3.0.2",
     "psm-utils==0.8.0",