diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d1a2d67..e4f6fb2 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -38,3 +38,14 @@ jobs: - name: Test with pytest run: | pytest + - name: Download test data + run: | + wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML + wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML + - name: Test percolator ms2rescore + run: | + quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05 +# - name: Test mokapot ms2rescore +# run: | +# quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05 --rescoring_engine "mokapot" +# diff --git a/environment.yml b/environment.yml index fdf14f4..4bacf18 100644 --- a/environment.yml +++ b/environment.yml @@ -13,6 +13,7 @@ dependencies: - pydantic - pandas - protobuf>=3.9.2,<4 # Ensure compatibility with tensorboard + - numpy - pip - pip: - tensorboard diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index 14a5fa9..c870e30 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -44,14 +44,14 @@ @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context def diann2mztab( - ctx, - folder, - exp_design, - dia_params, - diann_version, - charge, - missed_cleavages, - qvalue_threshold, + ctx, + folder, + exp_design, + dia_params, + diann_version, + charge, + missed_cleavages, + qvalue_threshold, ): """ Convert DIA-NN output to MSstats, Triqler or mzTab. @@ -61,7 +61,7 @@ def diann2mztab( :param folder: DiannConvert specifies the folder where the required file resides.
The folder contains the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence FASTA file, version file of DiaNN and ms_info TSVs - ?:param exp_design: Experimental design file + :param exp_design: Experimental design file :param dia_params: A list contains DIA parameters :type dia_params: list :param diann_version: Path to a version file of DIA-NN @@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file): lambda x: _true_stem(x["Spectra_Filepath"]), axis=1 ) - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] + s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:] s_header = data[empty_row + 1].replace("\n", "").split("\t") s_data_frame = pd.DataFrame(s_table, columns=s_header) @@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float: if aa in aa_mass and not_mod: aa = aa_mass[aa] elif ( - aa - not in [ - "G", - "A", - "V", - "L", - "I", - "F", - "M", - "P", - "W", - "S", - "C", - "T", - "Y", - "N", - "Q", - "D", - "E", - "K", - "R", - "H", - ] - and not_mod - and aa != ")" + aa + not in [ + "G", + "A", + "V", + "L", + "I", + "F", + "M", + "P", + "W", + "S", + "C", + "T", + "Y", + "N", + "Q", + "D", + "E", + "K", + "R", + "H", + ] + and not_mod + and aa != ")" ): logger.info(f"Unknown amino acid with mass not known:{aa}") peptide_parts.append(aa) @@ -367,13 +367,13 @@ def validate_diann_version(self) -> None: raise ValueError(f"Unsupported DIANN version {self.diann_version}") def convert_to_mztab( - self, - report, - f_table, - charge: int, - missed_cleavages: int, - dia_params: List[Any], - out: os.PathLike, + self, + report, + f_table, + charge: int, + missed_cleavages: int, + dia_params: List[Any], + out: Union[os.PathLike, str], ) -> None: logger.info("Converting to mzTab") self.validate_diann_version() @@ -481,8 +481,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: } mass_vector = report["Modified.Sequence"].map(uniq_masses) report["Calculate.Precursor.Mz"] = ( - mass_vector + (PROTON_MASS_U * report["Precursor.Charge"]) - ) / report["Precursor.Charge"] + mass_vector + (PROTON_MASS_U * report["Precursor.Charge"]) + ) / report["Precursor.Charge"] logger.debug("Indexing Precursors") # Making the map is 1500x faster @@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages): out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null" out_mztab_mtd.loc[1, "software[1]-setting[3]"] = ( - "fragment_mass_tolerance:" + fragment_mass_tolerance + "fragment_mass_tolerance:" + fragment_mass_tolerance ) out_mztab_mtd.loc[1, "software[1]-setting[4]"] = ( - "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit + "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit ) out_mztab_mtd.loc[1, "software[1]-setting[5]"] = ( - "precursor_mass_tolerance:" + precursor_mass_tolerance + "precursor_mass_tolerance:" + precursor_mass_tolerance ) out_mztab_mtd.loc[1, "software[1]-setting[6]"] = ( - "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit + "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit ) out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full" @@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages): missed_cleavages ) out_mztab_mtd.loc[1, "software[1]-setting[11]"] = ( - "fixed_modifications:" 
+ fixed_modifications + "fixed_modifications:" + fixed_modifications ) out_mztab_mtd.loc[1, "software[1]-setting[12]"] = ( - "variable_modifications:" + variable_modifications + "variable_modifications:" + variable_modifications ) (fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info( @@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages): ] out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[ i - 1 - ][1] + ][1] out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere" else: out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0] @@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages): "[MS, MS:1000584, mzML file, ]" ) out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = ( - "file://" - + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] + "file://" + + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] ) out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = ( "[MS, MS:1000777, spectrum identifier nativeID format, ]" @@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages): "[MS, MS:1002038, unlabeled sample, ]" ) out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = ( - "ms_run[" + str(i) + "]" + "ms_run[" + str(i) + "]" ) with warnings.catch_warnings(): @@ -723,9 +723,9 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): col = {} for i in file: col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) - + "]" + "protein_abundance_assay[" + + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) + + "]" ) pg.rename(columns=col, inplace=True) @@ -767,7 +767,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): protein_details_df = out_mztab_prh[ out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group" - ] + ] prh_series = ( protein_details_df["Protein.Ids"] .str.split(";", expand=True) @@ -884,11 +884,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): def mztab_peh( - report: pd.DataFrame, - pr: pd.DataFrame, - precursor_list: List[str], - index_ref: pd.DataFrame, - database: os.PathLike, + report: pd.DataFrame, + pr: pd.DataFrame, + precursor_list: List[str], + index_ref: pd.DataFrame, + database: os.PathLike, ) -> pd.DataFrame: """ Construct PEH sub-table. 
@@ -1106,8 +1106,8 @@ def __find_info(directory, n): # Standardize spectrum identifier format for bruker data if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str): target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[ - :, "opt_global_spectrum_reference" - ].astype(str) + :, "opt_global_spectrum_reference" + ].astype(str) # TODO seconds returned from precursor.getRT() target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1) @@ -1184,7 +1184,7 @@ def __find_info(directory, n): out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply( lambda x: "ms_run[{}]:".format(x["ms_run"]) - + x["opt_global_spectrum_reference"], + + x["opt_global_spectrum_reference"], axis=1, result_type="expand", ) @@ -1556,8 +1556,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]): for start, length in sorted(zip(starts, lengths)): if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start: merged_lengths[-1] = ( - max(merged_starts[-1] + merged_lengths[-1], start + length) - - merged_starts[-1] + max(merged_starts[-1] + merged_lengths[-1], start + length) + - merged_starts[-1] ) else: merged_starts.append(start) @@ -1569,7 +1569,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]): def calculate_protein_coverages( - report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame + report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame ) -> List[str]: """Calculates protein coverages for the PRH table. diff --git a/quantmsutils/diann/dianncfg.py b/quantmsutils/diann/dianncfg.py index 8f59ebb..7cb6f22 100644 --- a/quantmsutils/diann/dianncfg.py +++ b/quantmsutils/diann/dianncfg.py @@ -52,6 +52,7 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List if fix_mod != "": for mod in fix_mod.split(","): tag = 0 + diann_mod = None for modification in unimod_database.modifications: if modification.get_name() == mod.split(" ")[0]: diann_mod = ( @@ -80,12 +81,18 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List or "mTRAQ" in diann_mod ): fix_ptm.append(diann_mod + "," + site + "," + "label") - else: + elif diann_mod is not None: fix_ptm.append(diann_mod + "," + site) + else: + print( + "Warning: only Unimod modifications are currently supported for the DIA pipeline. Skipped: " + + mod + ) if var_mod != "": for mod in var_mod.split(","): tag = 0 + diann_mod = None for modification in unimod_database.modifications: if modification.get_name() == mod.split(" ")[0]: diann_mod = ( diff --git a/quantmsutils/mzml/mzml_statistics.py b/quantmsutils/mzml/mzml_statistics.py index 694ba59..ceb51aa 100644 --- a/quantmsutils/mzml/mzml_statistics.py +++ b/quantmsutils/mzml/mzml_statistics.py @@ -22,6 +22,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None: # Command line usage example python script_name.py mzml_statistics --ms_path "path/to/file.mzML" + :param ctx: Click context :param ms_path: A string specifying the path to the mass spectrometry file. :param id_only: A boolean flag that, when set to True, generates a CSV file containing only the spectrum ID and peaks data for MS level 2 spectra.
diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py index 54df3a6..732db25 100644 --- a/quantmsutils/quantmsutilsc.py +++ b/quantmsutils/quantmsutilsc.py @@ -31,5 +31,11 @@ def cli(): cli.add_command(convert_psm) +def main(): + try: + cli() + except SystemExit as e: + if e.code != 0: + raise if __name__ == "__main__": - cli() + main() diff --git a/quantmsutils/rescoring/ms2rescore.py b/quantmsutils/rescoring/ms2rescore.py index cc7028e..c7a56b8 100644 --- a/quantmsutils/rescoring/ms2rescore.py +++ b/quantmsutils/rescoring/ms2rescore.py @@ -83,7 +83,7 @@ def parse_cli_arguments_to_config(config_file: str = None, feature_generators: s if lower_score_is_better is not None: config["ms2rescore"]["lower_score_is_better"] = lower_score_is_better if processes is None: - processes = 1 # Default to single process + processes = 1 # Default to single process config["ms2rescore"]["processes"] = processes if output_path is not None: config["ms2rescore"]["output_path"] = output_path @@ -111,7 +111,7 @@ def rescore_idxml(input_file, output_file, config) -> None: def filter_out_artifact_psms( - psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] + psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] ) -> List[oms.PeptideIdentification]: """Filter out PeptideHits that could not be processed by all feature generators""" num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) @@ -222,8 +222,8 @@ def filter_out_artifact_psms( @click.option( "-re", "--rescoring_engine", - help="Either mokapot or percolator (default: `mokapot`)", - default="mokapot", + help="Either mokapot or percolator (default: `percolator`)", + default="percolator", type=click.Choice(["mokapot", "percolator"]), ) @click.option( @@ -252,23 +252,23 @@ def filter_out_artifact_psms( ) @click.pass_context def ms2rescore( - ctx, - psm_file: str, - spectrum_path, - output_path: str, - log_level, - processes, - fasta_file, - test_fdr, - feature_generators, - ms2pip_model, - ms2_tolerance, - calibration_set_size, - rescoring_engine, - rng, - id_decoy_pattern, - lower_score_is_better, - config_file: str, + ctx, + psm_file: str, + spectrum_path, + output_path: str, + log_level, + processes, + fasta_file, + test_fdr, + feature_generators, + ms2pip_model, + ms2_tolerance, + calibration_set_size, + rescoring_engine, + rng, + id_decoy_pattern, + lower_score_is_better, + config_file: str, ): """ Rescore PSMs in an idXML file and keep other information unchanged. @@ -296,6 +296,10 @@ def ms2rescore( if output_path is None: output_path = psm_file.replace(".idXML", "_ms2rescore.idXML") + if rescoring_engine == "mokapot": + logging.warning("Mokapot rescoring engine is not supported in this version. Please use Percolator.") + raise ValueError("Mokapot rescoring engine is not supported in this version.
Please use Percolator.") + config = parse_cli_arguments_to_config( config_file=config_file, output_path=output_path, diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py index f1c50e5..79f7c16 100644 --- a/quantmsutils/sdrf/check_samplesheet.py +++ b/quantmsutils/sdrf/check_samplesheet.py @@ -66,7 +66,7 @@ def check_expdesign(expdesign): ) sys.exit(1) - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] + s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:] s_header = lines[empty_row + 1].replace("\n", "").split("\t") s_data_frame = pd.DataFrame(s_table, columns=s_header) diff --git a/quantmsutils/sdrf/extract_sample.py b/quantmsutils/sdrf/extract_sample.py index 0d0e7ff..75f0a51 100644 --- a/quantmsutils/sdrf/extract_sample.py +++ b/quantmsutils/sdrf/extract_sample.py @@ -25,7 +25,7 @@ def extract_sample_from_expdesign(cxt, expdesign: str) -> None: with open(expdesign, "r") as f: lines = f.readlines() empty_row = lines.index("\n") - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] + s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:] s_header = lines[empty_row + 1].replace("\n", "").split("\t") s_data_frame = pd.DataFrame(s_table, columns=s_header) diff --git a/requirements.txt b/requirements.txt index 524f1cb..5b985c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pyopenms ms2rescore==3.0.2 psm-utils==0.8.0 pydantic -pandas \ No newline at end of file +pandas +numpy \ No newline at end of file diff --git a/setup.py b/setup.py index c0b2a44..b0da1a2 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ "psm-utils==0.8.0", "pydantic", "pandas", + "numpy", ] PYTHON_REQUIRES = ">=3.8,<4"
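
For context on the new entry point added to quantmsutils/quantmsutilsc.py above, here is a minimal, self-contained sketch of the same pattern (it assumes only Click; the `hello` subcommand and the script name are hypothetical stand-ins, not part of quantms-utils). In standalone mode Click terminates through SystemExit even after a successful run, so the wrapper re-raises only non-zero exit codes and lets clean exits return normally from main().

import click


@click.group()
def cli():
    """Stand-in for the quantms-utils command group."""
    pass


@cli.command()
def hello():
    """Hypothetical subcommand used only for illustration."""
    click.echo("hello")


def main():
    # Click raises SystemExit when a command finishes or fails;
    # mirror the patch above and re-raise only non-zero exit codes.
    try:
        cli()
    except SystemExit as e:
        if e.code != 0:
            raise


if __name__ == "__main__":
    main()

Saved as a standalone script (the name example_cli.py is arbitrary), `python example_cli.py hello` prints "hello" and exits with status 0, while a failing invocation such as an unknown option still surfaces Click's non-zero exit code.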