From a8a5fd5c3fa1006b5821a63c35b9757b5cb3aaf4 Mon Sep 17 00:00:00 2001 From: ypriverol Date: Tue, 30 Jul 2024 13:17:37 +0100 Subject: [PATCH 1/5] boad added --- environment.yml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 4000cac..2f4678f 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - conda-forge dependencies: - click - - sdrf-pipelines + - sdrf-pipelines => 0.0.28 - pyopenms - ms2rescore=3.0.2 - psm-utils=0.8.0 diff --git a/requirements.txt b/requirements.txt index d4e5679..14dfebb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ click -sdrf-pipelines +sdrf-pipelines>=0.0.28 pyopenms ms2rescore==3.0.2 psm-utils==0.8.0 diff --git a/setup.py b/setup.py index ab3eefd..712e096 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ INSTALL_REQUIRES = [ "click", - "sdrf-pipelines", + "sdrf-pipelines=>0.0.28", "pyopenms", "ms2rescore==3.0.2", "psm-utils==0.8.0", From 4b35a1813ab4ada57454b7ba69c7d5bd7b20571c Mon Sep 17 00:00:00 2001 From: ypriverol Date: Tue, 30 Jul 2024 13:23:16 +0100 Subject: [PATCH 2/5] boad added --- environment.yml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 2f4678f..5d3f13b 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - conda-forge dependencies: - click - - sdrf-pipelines => 0.0.28 + - sdrf-pipelines>=0.0.28 - pyopenms - ms2rescore=3.0.2 - psm-utils=0.8.0 diff --git a/requirements.txt b/requirements.txt index 14dfebb..443ea1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ click -sdrf-pipelines>=0.0.28 +sdrf-pipelines==0.0.28 pyopenms ms2rescore==3.0.2 psm-utils==0.8.0 diff --git a/setup.py b/setup.py index 712e096..68f6e2d 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ INSTALL_REQUIRES = [ "click", - "sdrf-pipelines=>0.0.28", + "sdrf-pipelines==0.0.28", "pyopenms", "ms2rescore==3.0.2", "psm-utils==0.8.0", From eb43e30644a05ea41b3cdfe196a296fb57ccdf17 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 30 Jul 2024 20:52:03 +0800 Subject: [PATCH 3/5] sync bin scripts --- quantmsutils/diann/diann2mztab.py | 49 ++++++++++++++-------------- quantmsutils/psm/psm_conversion.py | 8 +++-- quantmsutils/rescoring/ms2rescore.py | 13 ++++++++ 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index 8c1ee77..e84a31d 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -362,7 +362,7 @@ def diann_version(self) -> str: return diann_version_id def validate_diann_version(self) -> None: - supported_diann_versions = ["1.8.1"] + supported_diann_versions = ["1.8.1", "1.9.beta.1"] if self.diann_version not in supported_diann_versions: raise ValueError(f"Unsupported DIANN version {self.diann_version}") @@ -732,7 +732,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): logger.debug("Classifying results type ...") pg["opt_global_result_type"] = "single_protein" - pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = ( + pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = ( "indistinguishable_protein_group" ) @@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1) out_mztab_prh.rename( columns={ - "Protein.Group": "accession", "First.Protein.Description": "description", }, inplace=True, @@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): logger.debug("Extracting accession values (keeping first)...") out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply( - lambda x: x["accession"].split(";")[0], axis=1 + lambda x: x["Protein.Group"].split(";")[0], axis=1 ) protein_details_df = out_mztab_prh[ out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group" ] prh_series = ( - protein_details_df["Protein.Ids"] + protein_details_df["Protein.Group"] .str.split(";", expand=True) .stack() .reset_index(level=1, drop=True) @@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null" out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply( lambda x: ( - x["Protein.Ids"] + x["Protein.Group"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null" ), @@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): score_looker = ModScoreLooker(report) out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = ( out_mztab_prh.apply( - lambda x: score_looker.get_score(x["Protein.Ids"]), + lambda x: score_looker.get_score(x["Protein.Group"]), axis=1, result_type="expand", ) @@ -833,11 +832,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): # This used to be a bottleneck in performance # This implementation drops the run time from 57s to 25ms protein_agg_report = ( - report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]] - .groupby(["study_variable", "Protein.Ids"]) + report[["PG.MaxLFQ", "Protein.Group", "study_variable"]] + .groupby(["study_variable", "Protein.Group"]) .agg({"PG.MaxLFQ": ["mean", "std", "sem"]}) .reset_index() - .pivot(columns=["study_variable"], index="Protein.Ids") + .pivot(columns=["study_variable"], index="Protein.Group") .reset_index() ) protein_agg_report.columns = [ @@ -845,7 +844,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): for col in protein_agg_report.columns.values ] subname_mapper = { - "Protein.Ids::::": "Protein.Ids", + "Protein.Group::::": "Protein.Group", "PG.MaxLFQ::mean": "protein_abundance_study_variable", "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable", "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable", @@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values. out_mztab_prh = out_mztab_prh.merge( protein_agg_report, - on="Protein.Ids", + on="Protein.Group", how="left", validate="many_to_one", copy=True, @@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): out_mztab_prh.loc[:, "PRH"] = "PRT" index = out_mztab_prh.loc[:, "PRH"] out_mztab_prh.drop( - ["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True + ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True ) out_mztab_prh.insert(0, "PRH", index) out_mztab_prh.fillna("null", inplace=True) @@ -916,14 +915,14 @@ def mztab_peh( out_mztab_peh = pd.DataFrame() out_mztab_peh = pr.iloc[:, 0:10] out_mztab_peh.drop( - ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], + ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis=1, inplace=True, ) out_mztab_peh.rename( columns={ "Stripped.Sequence": "sequence", - "Protein.Ids": "accession", + "Protein.Group": "accession", "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence", "Precursor.Charge": "charge", }, @@ -1123,7 +1122,7 @@ def __find_info(directory, n): out_mztab_psh = out_mztab_psh[ [ "Stripped.Sequence", - "Protein.Ids", + "Protein.Group", "Q.Value", "RT.Start", "Precursor.Charge", @@ -1239,7 +1238,7 @@ def classify_result_type(target): :return: A string implys protein type :rtype: str """ - if ";" in target["Protein.Ids"]: + if ";" in target["Protein.Group"]: return "indistinguishable_protein_group" return "single_protein" @@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level): return tuple(q_value) if flag == 1 and level == "protein": - result = report[report["Protein.Ids"] == target] + result = report[report["Protein.Group"] == target] prh_params = [] for i in range(1, max_ + 1): match = result[result["study_variable"] == i] @@ -1320,9 +1319,9 @@ def __init__(self, report: pd.DataFrame) -> None: def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: grouped_df = ( - report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]] + report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]] .sort_values("Global.PG.Q.Value", ascending=True) - .groupby(["Protein.Ids"]) + .groupby(["Protein.Group"]) .head(1) ) # Modified.Sequence Protein.Ids Global.PG.Q.Value @@ -1332,7 +1331,7 @@ def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252 out = { - row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) + row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows() } return out @@ -1578,8 +1577,8 @@ def calculate_protein_coverages( protein in the PRH table (defined by accession, not protein.ids). """ nested_df = ( - report[["Protein.Ids", "Stripped.Sequence"]] - .groupby("Protein.Ids") + report[["Protein.Group", "Stripped.Sequence"]] + .groupby("Protein.Group") .agg({"Stripped.Sequence": set}) .reset_index() ) @@ -1587,8 +1586,8 @@ def calculate_protein_coverages( # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR} # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER} # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,... - ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"])) - acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Ids"])) + ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"])) + acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"])) fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"])) acc_to_fasta_ids: dict = {} diff --git a/quantmsutils/psm/psm_conversion.py b/quantmsutils/psm/psm_conversion.py index 0437c32..30b32c6 100644 --- a/quantmsutils/psm/psm_conversion.py +++ b/quantmsutils/psm/psm_conversion.py @@ -15,7 +15,7 @@ "modifications", "retention_time", "charge", - "calc_mass_to_charge", + "exp_mass_to_charge", "reference_file_name", "scan_number", "peptidoform", @@ -33,6 +33,8 @@ def mods_position(peptide): + if peptide.startswith("."): + peptide = peptide[1:] pattern = re.compile(r"\((.*?)\)") original_mods = pattern.findall(peptide) peptide = re.sub(r"\(.*?\)", ".", peptide) @@ -95,7 +97,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal for peptide_id in pep_ids: retention_time = peptide_id.getRT() - calc_mass_to_charge = peptide_id.getMZ() + exp_mass_to_charge = peptide_id.getMZ() scan_number = int( re.findall( r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference") @@ -153,7 +155,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal modifications, retention_time, charge, - calc_mass_to_charge, + exp_mass_to_charge, reference_file_name, scan_number, peptidoform, diff --git a/quantmsutils/rescoring/ms2rescore.py b/quantmsutils/rescoring/ms2rescore.py index 6633eda..33dab88 100644 --- a/quantmsutils/rescoring/ms2rescore.py +++ b/quantmsutils/rescoring/ms2rescore.py @@ -19,6 +19,7 @@ def parse_cli_arguments_to_config( config_file: str = None, feature_generators: str = None, + ms2pip_model_dir: str = None, ms2pip_model: str = None, ms2_tolerance: float = None, calibration_set_size: float = None, @@ -46,6 +47,7 @@ def parse_cli_arguments_to_config( config["ms2rescore"]["feature_generators"]["basic"] = {} if "ms2pip" in feature_generators_list: config["ms2rescore"]["feature_generators"]["ms2pip"] = { + "model_dir": ms2pip_model_dir, "model": ms2pip_model, "ms2_tolerance": ms2_tolerance, } @@ -76,6 +78,8 @@ def parse_cli_arguments_to_config( "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." ) + if ms2pip_model_dir is not None: + config["ms2rescore"]["ms2pip_model_dir"] = ms2pip_model_dir if ms2pip_model is not None: config["ms2rescore"]["ms2pip_model"] = ms2pip_model if ms2_tolerance is not None: @@ -216,6 +220,13 @@ def filter_out_artifact_psms( type=str, default="Immuno-HCD", ) +@click.option( + "-md", + "--ms2pip_model_dir", + help="The path of MSĀ²PIP model (default: `./`)", + type=str, + default="./" +) @click.option( "-ms2tol", "--ms2_tolerance", @@ -271,6 +282,7 @@ def ms2rescore( fasta_file, test_fdr, feature_generators, + ms2pip_model_dir, ms2pip_model, ms2_tolerance, calibration_set_size, @@ -318,6 +330,7 @@ def ms2rescore( config_file=config_file, output_path=output_path, feature_generators=feature_generators, + ms2pip_model_dir=ms2pip_model_dir, ms2pip_model=ms2pip_model, processes=processes, ms2_tolerance=ms2_tolerance, From 40b01eced090ac14d71489fc283ea2cd497c99a1 Mon Sep 17 00:00:00 2001 From: ypriverol Date: Tue, 30 Jul 2024 14:01:54 +0100 Subject: [PATCH 4/5] minor changes in the main command --- quantmsutils/quantmsutilsc.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py index 2aa94fc..0218150 100644 --- a/quantmsutils/quantmsutilsc.py +++ b/quantmsutils/quantmsutilsc.py @@ -19,16 +19,11 @@ def cli(): cli.add_command(dianncfg) cli.add_command(diann2mztab) - cli.add_command(add_sage_feature) - cli.add_command(mzml_statistics) - cli.add_command(extract_sample_from_expdesign) cli.add_command(check_samplesheet) - cli.add_command(ms2rescore) - cli.add_command(convert_psm) From 066773cee0deba58b61e3e094b5cda6239e17e92 Mon Sep 17 00:00:00 2001 From: ypriverol Date: Tue, 30 Jul 2024 14:16:28 +0100 Subject: [PATCH 5/5] versions added --- .github/workflows/conda-build.yml | 1 + quantmsutils/__init__.py | 1 + quantmsutils/quantmsutilsc.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index 0f3dc57..90a5e86 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -47,4 +47,5 @@ jobs: run: | conda activate quantmsutils quantmsutilsc --help + quantmsutilsc --version shell: bash -l {0} \ No newline at end of file diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py index e69de29..e344246 100644 --- a/quantmsutils/__init__.py +++ b/quantmsutils/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.3" \ No newline at end of file diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py index 0218150..835be05 100644 --- a/quantmsutils/quantmsutilsc.py +++ b/quantmsutils/quantmsutilsc.py @@ -8,10 +8,12 @@ from quantmsutils.rescoring.ms2rescore import ms2rescore from quantmsutils.sdrf.check_samplesheet import check_samplesheet from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign +from quantmsutils import __version__ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +@click.version_option(version=__version__, package_name="quantmsutils", message="%(package)s %(version)s") @click.group(context_settings=CONTEXT_SETTINGS) def cli(): pass