From c666cc431043315f46888f39ec43b20387870b80 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sat, 25 Jan 2025 18:23:08 +0100 Subject: [PATCH 1/8] :white_check_mark: start reading the data - downstream ion parsing does not work yet --- proteobench/io/parsing/parse_ion.py | 14 ++- ..._quantms.sdrf_openms_design_msstats_in.csv | 101 ++++++++++++++++++ test/test_module_dda_quant.py | 1 + 3 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 test/data/dda_quant/sample_dda_quantms.sdrf_openms_design_msstats_in.csv diff --git a/proteobench/io/parsing/parse_ion.py b/proteobench/io/parsing/parse_ion.py index 82a1a2f7..a4f71fe6 100644 --- a/proteobench/io/parsing/parse_ion.py +++ b/proteobench/io/parsing/parse_ion.py @@ -1,7 +1,6 @@ import math import os import re -from typing import Dict, List, Optional import pandas as pd @@ -109,7 +108,18 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame: input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t") elif input_format == "PEAKS": input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",") - + elif input_format == "quantms": + input_data_frame = pd.read_csv(input_csv, low_memory=False) + input_data_frame = input_data_frame.assign( + Sequence=input_data_frame["PeptideSequence"].str.replace( + r"\(([^)]+)\)", + r"", + regex=True, + ), + ) + input_data_frame["Sequence"] = input_data_frame["PeptideSequence"].str.replace(r"\(([^)]+)\)", r"", regex=True) + else: + raise ValueError(f"Input format '{input_format}' not recognized.") return input_data_frame diff --git a/test/data/dda_quant/sample_dda_quantms.sdrf_openms_design_msstats_in.csv b/test/data/dda_quant/sample_dda_quantms.sdrf_openms_design_msstats_in.csv new file mode 100644 index 00000000..ee6ce861 --- /dev/null +++ b/test/data/dda_quant/sample_dda_quantms.sdrf_openms_design_msstats_in.csv @@ -0,0 +1,101 @@ 
+ProteinName,PeptideSequence,PrecursorCharge,FragmentIon,ProductCharge,IsotopeLabelType,Condition,BioReplicate,Run,Intensity,Reference +sp|P09733|TBA1_YEAST,DLFHPEQLISGK,3,,0,L,cond_A,1,1,131708800.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P32119|PRDX2_HUMAN,TDEGIAYR,2,,0,L,cond_B,4,4,845348600.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P49321|NASP_HUMAN,EQVYDAMGEK,2,,0,L,cond_B,4,4,48666640.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P50990|TCPQ_HUMAN,TVGATALPR,2,,0,L,cond_A,1,1,958554700.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q99832|TCPH_HUMAN,SLHDAIMIVR,3,,0,L,cond_B,4,4,329470100.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P58004|SESN2_HUMAN,GPSAFIPVEEVLR,2,,0,L,cond_B,5,5,29033340.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P02545|LMNA_HUMAN,ASASGSGAQVGGPISSGSSASSVTVTR,3,,0,L,cond_B,4,4,97719470.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q8WWM7|ATX2L_HUMAN,FTDSAIAMNSK,2,,0,L,cond_B,6,6,47579640.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P50552|VASP_HUMAN,QQPGPSEHIER,3,,0,L,cond_B,4,4,118654400.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q7L014|DDX46_HUMAN,ASYPC(Carbamidomethyl)MSLHGGIDQYDR,3,,0,L,cond_B,4,4,38434830.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P30154|2AAB_HUMAN,MAGDQVANVR,2,,0,L,cond_A,1,1,47908580.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q04726|TLE3_HUMAN,AELTSSAPAC(Carbamidomethyl)YALAISPDAK,2,,0,L,cond_A,2,2,22015490.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|P35658|NU214_HUMAN,SSLLAVSNK,2,,0,L,cond_A,1,1,91677080.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q8WUM4|PDC6I_HUMAN,LLDEEEATDNDLR,2,,0,L,cond_A,1,1,245312700.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P38910|CH10_YEAST,LNQAEVVAVGPGFTDANGNK,3,,0,L,cond_B,4,4,81358820.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML 
+sp|Q9UQR0|SCML2_HUMAN,SEAPSYIAVPDPSVLK,2,,0,L,cond_A,2,2,43908620.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|O43390|HNRPR_HUMAN,LC(Carbamidomethyl)DSYEIRPGK,2,,0,L,cond_A,1,1,50041590.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P05373|HEM2_YEAST,GLINANLAHK,2,,0,L,cond_B,5,5,32199620.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Q04438|SPG4_YEAST,NVDISNMSQGEFLR,2,,0,L,cond_B,4,4,70438430.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q9NUQ3|TXLNG_HUMAN,ALGAHLEAEPK,3,,0,L,cond_A,1,1,83372190.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P49411|EFTU_HUMAN,NMITGTAPLDGC(Carbamidomethyl)ILVVAANDGPMPQTR,3,,0,L,cond_A,1,1,128703300.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q14657|LAGE3_HUMAN,FGPPVSR,2,,0,L,cond_B,6,6,57360970.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P38426|TPS3_YEAST,NPNLSFDSHPPR,3,,0,L,cond_A,1,1,61451870.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P21333|FLNA_HUMAN,SPFEVYVDK,2,,0,L,cond_B,5,5,297051000.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Q14684|RRP1B_HUMAN,AGPGSLELC(Carbamidomethyl)GLPSQK,2,,0,L,cond_B,6,6,60428020.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|Q07666|KHDR1_HUMAN,ILGPQGNTIK,2,,0,L,cond_A,1,1,507243000.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P48735|IDHP_HUMAN,VC(Carbamidomethyl)VETVESGAMTK,2,,0,L,cond_B,4,4,49347380.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q15645|PCH2_HUMAN,IDVAFVDR,2,,0,L,cond_A,2,2,70441630.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|P00558|PGK1_HUMAN,AC(Carbamidomethyl)ANPAAGSVILLENLR,3,,0,L,cond_A,3,3,639706700.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P46367|ALDH4_YEAST,HIYQSAAAGLK,2,,0,L,cond_A,2,2,190671900.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q5T653|RM02_HUMAN,QMQVLETC(Carbamidomethyl)VATVGR,2,,0,L,cond_A,2,2,38343660.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML 
+sp|Q9NWV4|CZIB_HUMAN,TIVEFEC(Carbamidomethyl)R,2,,0,L,cond_A,1,1,82191550.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P09972|ALDOC_HUMAN,YTPEEIAMATVTALR,2,,0,L,cond_B,6,6,351006400.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P42696|RBM34_HUMAN,IQINQEEER,2,,0,L,cond_B,5,5,17101070.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Q8WUM4|PDC6I_HUMAN,FYNELTEILVR,2,,0,L,cond_A,2,2,220731500.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q15833|STXB2_HUMAN,EPIPSLEAIYLLSPTEK,2,,0,L,cond_A,3,3,26323180.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|Q9BWS9|CHID1_HUMAN,GLVVTDLK,2,,0,L,cond_A,3,3,25301540.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|Q02486|ABF2_YEAST,LYSEYQK,2,,0,L,cond_B,5,5,48212330.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Cont_Q3SX14|GELS_BOVIN,AQPVQVAEGSEPDSFWEALGGK,2,,0,L,cond_A,1,1,29316550.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q92841|DDX17_HUMAN,FVINYDYPNSSEDYVHR,3,,0,L,cond_B,4,4,331407500.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q13428|TCOF_HUMAN,LGAGEGGEASVSPEK,2,,0,L,cond_B,6,6,37681520.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P11177|ODPB_HUMAN,DAINQGMDEELERDEK,3,,0,L,cond_A,2,2,69755510.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|P08758|ANXA5_HUMAN,ALLLLC(Carbamidomethyl)GEDD,2,,0,L,cond_A,1,1,41767580.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q9UHD8|SEPT9_HUMAN,FINDQYEK,2,,0,L,cond_A,2,2,93834380.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q9UNZ2|NSF1C_HUMAN,SPNELVDDLFK,2,,0,L,cond_A,2,2,38462320.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q1KMD3|HNRL2_HUMAN,NGEDLGVAFWISK,2,,0,L,cond_B,4,4,124289300.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P16157|ANK1_HUMAN,LGYISVTDVLK,2,,0,L,cond_B,4,4,78834210.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML 
+sp|P30050|RL12_HUMAN,QAQIEVVPSASALIIK,2,,0,L,cond_A,1,1,610790300.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P15880|RS2_HUMAN,LSIVPVR,2,,0,L,cond_B,4,4,618046500.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P27616|PUR7_YEAST,TELDGILPLVAR,2,,0,L,cond_A,3,3,435745100.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P26641|EF1G_HUMAN,STFVLDEFK,2,,0,L,cond_B,4,4,1932950000.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|O15498|YKT6_HUMAN,VAFTLLEK,2,,0,L,cond_B,5,5,38799580.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P38088|SYG_YEAST,YDIGNPVTGETLESPR,2,,0,L,cond_B,4,4,186166200.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P49589|SYCC_HUMAN,VSEYVPEIVNFVQK,3,,0,L,cond_B,5,5,26276280.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P38764|RPN1_YEAST,VGQAVETVGQAGRPK,2,,0,L,cond_A,3,3,22935790.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P28482|MK01_HUMAN,VADPDHDHTGFLTEYVATR,4,,0,L,cond_B,4,4,230327400.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P23381|SYWC_HUMAN,MSASDPNSSIFLTDTAK,2,,0,L,cond_A,3,3,346003800.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P38701|RS20_YEAST,YIDLEAPVQIVK,3,,0,L,cond_A,1,1,49868200.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P16120|THRC_YEAST,ADVELVK,2,,0,L,cond_B,6,6,3600227000.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P09110|THIK_HUMAN,QDTFALASQQK,2,,0,L,cond_A,3,3,84843690.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P30044|PRDX5_HUMAN,THLPGFVEQAEALK,3,,0,L,cond_B,5,5,286970800.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P35197|GCS1_YEAST,NFNGNAEDSSTAGNTTHTEYQK,3,,0,L,cond_A,1,1,30576350.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q99567|NUP88_HUMAN,NQSPTEAEKPASSSLPSSPPPQLLTR,3,,0,L,cond_B,6,6,122309400.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P0A6G7|CLPP_ECOLI,FLSAPEAVEYGLVDSILTHR,3,,0,L,cond_A,3,3,97005950.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML 
+sp|Q14690|RRP5_HUMAN,VVILNVDLLK,2,,0,L,cond_B,5,5,15405070.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P0A6P1|EFTS_ECOLI,VAALEGDVLGSYQHGAR,2,,0,L,cond_B,5,5,68910670.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|O43583|DENR_HUMAN,QEAGISEGQGTAGEEEEK,2,,0,L,cond_B,6,6,74275150.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|Q6PJ69|TRI65_HUMAN,ASLEVTQQQATQAEGQLLELR,3,,0,L,cond_A,2,2,14652530.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|O15397|IPO8_HUMAN,ETENDDVTNVIQK,2,,0,L,cond_A,3,3,41540000.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|P0A9G6|ACEA_ECOLI,ADQIQWSAGIEPGDPR,2,,0,L,cond_B,5,5,106234000.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Q15393|SF3B3_HUMAN,MQGQEAVLAMSSR,2,,0,L,cond_B,6,6,285025400.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P07814|SYEP_HUMAN,EAPC(Carbamidomethyl)VLIYIPDGHTK,2,,0,L,cond_A,3,3,49156330.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML +sp|Q9H9A7|RMI1_HUMAN,DLEHPLLPDGILEIPK,3,,0,L,cond_B,4,4,13267190.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|O94880|PHF14_HUMAN,LNIPAILR,2,,0,L,cond_B,6,6,18349690.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|Q08J23|NSUN2_HUMAN,LAQEGIYTLYPFINSR,3,,0,L,cond_B,5,5,41253360.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P32929|CGL_HUMAN,VIYPGLPSHPQHELVK,3,,0,L,cond_B,6,6,96001410.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P0AGE9|SUCD_ECOLI,SGTLTYEAVK,2,,0,L,cond_B,4,4,131380700.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|O15355|PPM1G_HUMAN,GTEAGQVGEPGIPTGEAGPSC(Carbamidomethyl)SSASDK,2,,0,L,cond_A,2,2,32767620.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q99614|TTC1_HUMAN,SNEDVNSSELDEEYLIELEK,2,,0,L,cond_B,4,4,44210590.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P62316|SMD2_HUMAN,REEEEFNTGPLSVLTQSVK,2,,0,L,cond_A,3,3,216058000.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML 
+sp|O14828|SCAM3_HUMAN,ELQHAALGGTATR,2,,0,L,cond_B,4,4,8710374.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q9BX10|GTPB2_HUMAN,VGADITVLR,2,,0,L,cond_B,5,5,30394810.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|P31539|HS104_YEAST,YAIDMTEQAR,2,,0,L,cond_A,2,2,309601800.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q08723|RPN8_YEAST,VTNSFALPFEEDEK,2,,0,L,cond_A,1,1,53689080.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P53250|TWF1_YEAST,SFEELVQLASQER,2,,0,L,cond_B,6,6,17220740.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P14922|CYC8_YEAST,ANEIYFR,2,,0,L,cond_A,1,1,19421460.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q9UJV9|DDX41_HUMAN,GVEAVAIHGGK,2,,0,L,cond_B,4,4,12814100.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q99856|ARI3A_HUMAN,MALVADEQQR,2,,0,L,cond_A,1,1,207930200.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Q9Y613|FHOD1_HUMAN,FSGVAGEAPSNPSVPVAVSSGPGR,2,,0,L,cond_B,4,4,14475940.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P46778|RL21_HUMAN,VYNVTQHAVGIVVNK,3,,0,L,cond_A,1,1,817014100.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|P31939|PUR9_HUMAN,NLTALGLNLVASGGTAK,2,,0,L,cond_B,6,6,409515000.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML +sp|P09105|HBAT_HUMAN,LGSNVGVYTTEALER,2,,0,L,cond_A,2,2,47899360.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q96T37|RBM15_HUMAN,SSGAASSAPGGGDGAEYK,2,,0,L,cond_B,5,5,20204150.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML +sp|Q96G03|PGM2_HUMAN,MNDLTIIQTTQGFC(Carbamidomethyl)R,2,,0,L,cond_B,4,4,44245960.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q9BXP5|SRRT_HUMAN,EEEWEKPK,2,,0,L,cond_B,4,4,25720120.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|P42704|LPPRC_HUMAN,MEEANIQPNR,2,,0,L,cond_B,4,4,104089900.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML +sp|Q86V81|THOC4_HUMAN,SLGTADVHFER,3,,0,L,cond_B,4,4,527266100.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML 
+sp|P22336|RFA1_YEAST,EEDPNEFTK,2,,0,L,cond_A,1,1,20258070.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML +sp|Cont_Q7SIH1|A2MG_BOVIN,NEESLVFVQTDKPIYKPEQTVK,4,,0,L,cond_A,2,2,120535100.0,LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML +sp|Q8WUM4|PDC6I_HUMAN,DTIVLLC(Carbamidomethyl)KPEPELNAAIPSANPAK,2,,0,L,cond_B,5,5,27655920.0,LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML diff --git a/test/test_module_dda_quant.py b/test/test_module_dda_quant.py index adbd4c7b..74f7fb2d 100644 --- a/test/test_module_dda_quant.py +++ b/test/test_module_dda_quant.py @@ -28,6 +28,7 @@ "ProlineStudio": os.path.join(TESTDATA_DIR, "Proline_DDA_quan_ions_subset.xlsx"), "MSAngel": os.path.join(TESTDATA_DIR, "MSAngel_DDA_quan_ions_subset.xlsx"), "i2MassChroQ": os.path.join(TESTDATA_DIR, "i2MassChroQ_DDA_quant_ions_test_new_random_subset.tsv"), + "quantms": os.path.join(TESTDATA_DIR, "sample_dda_quantms.sdrf_openms_design_msstats_in.csv"), } From e280fc65f05babc55cdd2963a674bd159165915e Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sat, 25 Jan 2025 18:35:16 +0100 Subject: [PATCH 2/8] :wrench::construction: add not entirely correct configuration --- .../lfq/ion/DDA/parse_settings_msstats.toml | 33 +++++++++++++++++++ .../parse_settings_files.toml | 1 + 2 files changed, 34 insertions(+) create mode 100644 proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msstats.toml diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msstats.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msstats.toml new file mode 100644 index 00000000..10a1cfa7 --- /dev/null +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msstats.toml @@ -0,0 +1,33 @@ +[mapper] +"ProteinName" = "Proteins" +"Sequence" = "Sequence" +"PrecursorCharge" = "Charge" +"Reference" = "Raw file" +"PeptideSequence" = "Modified sequence" + +[condition_mapper] +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "A" 
+"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "A" +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "A" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "B" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "B" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "B" + +[run_mapper] +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.mzML" = "Condition_A_Sample_Alpha_01" +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.mzML" = "Condition_A_Sample_Alpha_02" +"LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.mzML" = "Condition_A_Sample_Alpha_03" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.mzML" = "Condition_B_Sample_Alpha_01" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.mzML" = "Condition_B_Sample_Alpha_02" +"LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.mzML" = "Condition_B_Sample_Alpha_03" + + +[species_mapper] +"_YEAST" = "YEAST" +"_ECOLI" = "ECOLI" +"_HUMAN" = "HUMAN" + + +[general] +"contaminant_flag" = "Cont_" +"decoy_flag" = false diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml index 2b25755a..c6b9bcee 100644 --- a/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml +++ b/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml @@ -7,6 +7,7 @@ "MSAngel" = "parse_settings_msangel.toml" "Sage" = "parse_settings_sage.toml" "PEAKS" = "parse_settings_peaks.toml" +"quantms" = "parse_settings_msstats.toml" "Custom" = "parse_settings_custom.toml" [quant_lfq_peptidoform_DDA] From 9d0ddc08d7cf5a537113fdd59d3354e344b986e0 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sun, 26 Jan 2025 16:45:55 +0100 Subject: [PATCH 3/8] :art: raise errors if proforma is missing and explicit error handling --- proteobench/io/parsing/parse_settings.py | 8 +++---- .../quant/lfq/ion/DDA/quant_lfq_ion_DDA.py | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/proteobench/io/parsing/parse_settings.py 
b/proteobench/io/parsing/parse_settings.py index f45377e6..1e99dc3c 100644 --- a/proteobench/io/parsing/parse_settings.py +++ b/proteobench/io/parsing/parse_settings.py @@ -171,14 +171,14 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di df_filtered_melted["proforma"] + "|Z=" + df_filtered_melted["Charge"].astype(str) ) else: - print("Not all columns required for making the ion are available.") + raise ValueError("Not all columns required for making the ion are available: 'proforma' and 'Charge'.") return df_filtered_melted, replicate_to_raw elif self.analysis_level == "peptidoform": if "proforma" in df_filtered_melted.columns: df_filtered_melted["peptidoform"] = df_filtered_melted["proforma"] else: - print("Not all columns required for making the peptidoform are available.") + raise ValueError("Not all columns required for making the peptidoform are available: 'proforma'.") return df_filtered_melted, replicate_to_raw else: @@ -244,11 +244,11 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di if self.parser.analysis_level == "ion": try: df["precursor ion"] = df["proforma"] + "|Z=" + df["Charge"].astype(str) - except KeyError: + except KeyError as e: raise KeyError( "Not all columns required for making the ion are available." " Is the charge available in the input file?" 
- ) + ) from e return df, replicate_to_raw diff --git a/proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py b/proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py index a2a6af77..2766784a 100644 --- a/proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py +++ b/proteobench/modules/quant/lfq/ion/DDA/quant_lfq_ion_DDA.py @@ -113,42 +113,45 @@ def benchmarking( except pd.errors.ParserError as e: raise ParseError( f"Error parsing {input_format} file, please make sure the format is correct and the correct software tool is chosen: {e}" - ) + ) from e except Exception as e: - raise ParseSettingsError(f"Error parsing the input file: {e}") + raise ParseSettingsError("Error parsing the input file.") from e + msg = f"Folder: {self.parse_settings_dir}, Module: {self.module_id}" # Parse settings file try: parse_settings = ParseSettingsBuilder( parse_settings_dir=self.parse_settings_dir, module_id=self.module_id ).build_parser(input_format) except KeyError as e: - raise ParseSettingsError(f"Error parsing settings file for parsing, settings seem to be missing: {e}") + raise ParseSettingsError( + f"Error parsing settings file for parsing, settings seem to be missing: {msg}" + ) from e except FileNotFoundError as e: - raise ParseSettingsError(f"Could not find the parsing settings file: {e}") + raise ParseSettingsError(f"Could not find the parsing settings file: {msg}") from e except Exception as e: - raise ParseSettingsError(f"Error parsing settings file for parsing: {e}") + raise ParseSettingsError(f"Error parsing settings file for parsing: {msg}") from e try: standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df) except KeyError as e: - raise ConvertStandardFormatError(f"Error converting to standard format, key missing: {e}") + raise ConvertStandardFormatError("Error converting to standard format, key missing.") from e except Exception as e: - raise ConvertStandardFormatError(f"Error converting to standard format: {e}") + raise 
ConvertStandardFormatError("Error converting to standard format.") from e - # calculate quantification scores + # instantiate quantification scores try: quant_score = QuantScores( self.precursor_name, parse_settings.species_expected_ratio(), parse_settings.species_dict() ) except Exception as e: - raise QuantificationError(f"Error generating quantification scores: {e}") + raise QuantificationError("Error generating quantification scores.") from e # generate intermediate data structure try: intermediate_data_structure = quant_score.generate_intermediate(standard_format, replicate_to_raw) except Exception as e: - raise IntermediateFormatGenerationError(f"Error generating intermediate data structure: {e}") + raise IntermediateFormatGenerationError("Error generating intermediate data structure.") from e # try: current_datapoint = Datapoint.generate_datapoint( From 03215953f6986f7f2daa866761c24aeaec91e825 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sun, 26 Jan 2025 16:47:26 +0100 Subject: [PATCH 4/8] :bug: add proforma manually as no modification parsing is specified. ParseModifications. logic in proteobench/io/parsing/parse_settings.py add 'proforma' column which is required. 
--- proteobench/io/parsing/parse_ion.py | 2 +- proteobench/plotting/plot_quant.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/proteobench/io/parsing/parse_ion.py b/proteobench/io/parsing/parse_ion.py index a4f71fe6..3e41c8fd 100644 --- a/proteobench/io/parsing/parse_ion.py +++ b/proteobench/io/parsing/parse_ion.py @@ -111,7 +111,7 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame: elif input_format == "quantms": input_data_frame = pd.read_csv(input_csv, low_memory=False) input_data_frame = input_data_frame.assign( - Sequence=input_data_frame["PeptideSequence"].str.replace( + proforma=input_data_frame["PeptideSequence"].str.replace( r"\(([^)]+)\)", r"", regex=True, diff --git a/proteobench/plotting/plot_quant.py b/proteobench/plotting/plot_quant.py index b714e189..fe9598a3 100644 --- a/proteobench/plotting/plot_quant.py +++ b/proteobench/plotting/plot_quant.py @@ -89,6 +89,7 @@ def plot_metric( "MSAID": "#afff57", "Proteome Discoverer": "#8c564b", "PEAKS": "#f781bf", + "quantms": "#03fc39", }, mapping: Dict[str, int] = {"old": 10, "new": 20}, highlight_color: str = "#d30067", From 49e792998e5368066eea0efedd04fa061e466152 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sun, 26 Jan 2025 17:01:49 +0100 Subject: [PATCH 5/8] :bug: move to separate issue #556 --- proteobench/io/parsing/parse_settings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/proteobench/io/parsing/parse_settings.py b/proteobench/io/parsing/parse_settings.py index 1e99dc3c..033354c8 100644 --- a/proteobench/io/parsing/parse_settings.py +++ b/proteobench/io/parsing/parse_settings.py @@ -171,14 +171,16 @@ def convert_to_standard_format(self, df: pd.DataFrame) -> tuple[pd.DataFrame, Di df_filtered_melted["proforma"] + "|Z=" + df_filtered_melted["Charge"].astype(str) ) else: - raise ValueError("Not all columns required for making the ion are available: 'proforma' and 'Charge'.") + # ! 
raise ValueError + print("Not all columns required for making the ion are available: 'proforma' and 'Charge'.") return df_filtered_melted, replicate_to_raw elif self.analysis_level == "peptidoform": if "proforma" in df_filtered_melted.columns: df_filtered_melted["peptidoform"] = df_filtered_melted["proforma"] else: - raise ValueError("Not all columns required for making the peptidoform are available: 'proforma'.") + # ! raise ValueError + print("Not all columns required for making the peptidoform are available: 'proforma'.") return df_filtered_melted, replicate_to_raw else: From 38058a82342ae5d83dc77b09fa26a5b5e7f2d81e Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 29 Jan 2025 12:54:57 +0100 Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=9A=A7=20Start=20file=20reading=20of?= =?UTF-8?q?=20quantms=20parameter=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proteobench/io/params/quantms.py | 147 ++++++++++++++++ ....nf_core_quantms_software_mqc_versions.yml | 30 ++++ test/params/quantms_1-3.sdrf_config.tsv | 7 + test/params/quantms_1-3_dev.json | 162 ++++++++++++++++++ 4 files changed, 346 insertions(+) create mode 100644 proteobench/io/params/quantms.py create mode 100644 test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml create mode 100644 test/params/quantms_1-3.sdrf_config.tsv create mode 100644 test/params/quantms_1-3_dev.json diff --git a/proteobench/io/params/quantms.py b/proteobench/io/params/quantms.py new file mode 100644 index 00000000..ab6b9845 --- /dev/null +++ b/proteobench/io/params/quantms.py @@ -0,0 +1,147 @@ +"""quantms is a nextflow pipeline that execution depends on the settings in an +SDRF file. It is executed using a parameters file in JSON format. + +However, the version of packages are dumped to a versions yaml file. And some parameters +are taken from the SDRF file. 
+""" + +import json +import logging +import pathlib +from typing import IO, Union + +import pandas as pd +import yaml + +from proteobench.io.params import ProteoBenchParameters + +logger = logging.getLogger(__name__) + + +def load_versions(file: IO) -> dict: + """ + Load the versions of the tools used in the quantms pipeline. + """ + versions = yaml.safe_load(file) + return versions + + +def load_parsed_sdrf(file: Union[str, pathlib.Path, IO]) -> pd.DataFrame: + """ + Load the parsed SDRF file. + """ + return pd.read_csv(file, sep="\t") + + +def load_files(file1: IO, file2: IO, file3: IO) -> [dict, pd.DataFrame]: + """Load file independent of order they are provided in.""" + versions = None + sdrf = None + pipeline_params = None + for file in [file1, file2, file3]: + try: + _versions = load_versions(file) + if "Workflow" not in _versions: + logger.debug("Loaded other file.") + elif versions is None: + versions = _versions + continue + elif "custom_config_base" in _versions: + logger.debug("Loaded nextflow parameters file.") + else: + raise ValueError("Multiple version files provided.") + except yaml.YAMLError as e: + file.seek(0) + + try: + # file.seek(0) + _pipeline_params = json.load(file) + if pipeline_params is None: + pipeline_params = _pipeline_params + continue + else: + raise ValueError("Multiple parameter files provided.") + except json.JSONDecodeError as e: + print(e) + file.seek(0) + + try: + # file.seek(0) + _sdrf = load_parsed_sdrf(file) + if _sdrf.shape[1] == 1: + logger.debug("Loaded version or parameter file. Skip") + continue + elif sdrf is None: + sdrf = _sdrf + else: + raise ValueError("Multiple SDRF files provided.") + except pd.errors.EmptyDataError as e: + pass + return versions, sdrf, pipeline_params + + +def extract_params(file1: IO, file2: IO, file3: IO) -> ProteoBenchParameters: + """ + Extract parameters from the parsed SDRF and version file. We use both the parsed + SDRF file and the yaml file of versions to extract the parameters. 
The function + needs to be able to handle any order of files as the streamlit interfaces does + allow the user to select any order. + + This might be changed in a newer quantms version with one central parameters + file. + """ + versions, sdrf, pipeline_params = load_files(file1, file2, file3) + + params = ProteoBenchParameters() + params.software_name = "quantms" + params.software_version = versions["Workflow"]["bigbio/quantms"] + engines = list() + engines_version = list() + for key in versions: + if key.startswith("SEARCHENGINE"): + _engine = key.split("SEARCHENGINE")[-1].lower() + engines.append(_engine) + if _engine == "comet": + engines_version.append(versions[key]["Comet"]) + elif _engine == "msgf": + versions.append(versions[key]["msgf_plus"]) + else: + raise ValueError(f"Unknown search engine: {_engine}") + if engines: + params.search_engine = ",".join(engines) + if engines_version: + params.search_engine_version = ",".join(engines_version) + + return (versions, sdrf, pipeline_params, params) + + +if __name__ == "__main__": + + from pathlib import Path + + fpath1 = Path("../../../test/params/quantms_1-3.sdrf_config.tsv") + fpath2 = Path("../../../test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml") + fpath3 = Path("../../../test/params/quantms_1-3_dev.json") + + # Extract parameters from the fileP + with open(fpath1, "r") as file1, open(fpath2, "r") as file2, open(fpath3, "r") as file3: + versions, sdrf, pipeline_params, params = extract_params(file1, file2, file3) + display(params.__dict__) + + import itertools + + permutations_fpath = list(itertools.permutations([fpath1, fpath2, fpath3])) + for file1, file2, file3 in permutations_fpath: + print(file1.name, file2.name, file3.name) + with open(file1, "r") as f1, open(file2, "r") as f2, open(file3, "r") as f3: + _versions, _sdrf, _pipeline_params, params = extract_params(f1, f2, f3) + assert _versions == versions + assert _sdrf.equals(sdrf) + assert _pipeline_params == pipeline_params + # 
display(params.__dict__) + + # Convert the extracted parameters to a dictioPnary and then to a pandas Series + # data_dict = params.__dict__ + # series = pd.Series(data_dict) + # # Write the Series to a CSV file + # series.to_csv(file.with_suffix(".csv")) diff --git a/test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml b/test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml new file mode 100644 index 00000000..7a53f716 --- /dev/null +++ b/test/params/quantms_1-3.nf_core_quantms_software_mqc_versions.yml @@ -0,0 +1,30 @@ +DECOYDATABASE: + DecoyDatabase: 3.2.0-pre-exported-20240919 +EXTRACTPSMFEATURES: + PSMFeatureExtractor: 3.2.0-pre-exported-20240919 +IDFILTER: + IDFilter: 3.2.0-pre-exported-20240919 +IDSCORESWITCHER: + IDScoreSwitcher: 3.2.0-pre-exported-20240919 +MSSTATS: + r-base: 4.3.2 + bioconductor-msstats: 4.10.0 +MZMLSTATISTICS: + quantms-utils: 0.0.18 +PERCOLATOR: + PercolatorAdapter: 3.2.0-pre-exported-20240919 + percolator: 3.05.0, Build Date Aug 31 2020 19:03:04 +PROTEOMICSLFQ: + ProteomicsLFQ: 3.2.0-pre-exported-20240919 +SAMPLESHEET_CHECK: + quantms-utils: 0.0.18 +SDRFPARSING: + sdrf-pipelines: 0.0.31 +SEARCHENGINECOMET: + CometAdapter: 3.2.0-pre-exported-20240919 + Comet: 2023.01 rev. 
2 +THERMORAWFILEPARSER: + ThermoRawFileParser: 1.3.4 +Workflow: + bigbio/quantms: v1.3.1dev-g70337bc + Nextflow: 24.10.3 diff --git a/test/params/quantms_1-3.sdrf_config.tsv b/test/params/quantms_1-3.sdrf_config.tsv new file mode 100644 index 00000000..8cdc8655 --- /dev/null +++ b/test/params/quantms_1-3.sdrf_config.tsv @@ -0,0 +1,7 @@ +URI Filename FixedModifications VariableModifications Proteomics Data Acquisition Method Label PrecursorMassTolerance PrecursorMassToleranceUnit FragmentMassTolerance FragmentMassToleranceUnit DissociationMethod Enzyme +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.raw LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.raw Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.raw LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.raw Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.raw LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.raw Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.raw LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.raw Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.raw LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.raw Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin +ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2021/10/PXD010000/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.raw LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.raw 
Carbamidomethyl (C) Data-Dependent Acquisition label free sample 10 ppm 20 ppm HCD Trypsin diff --git a/test/params/quantms_1-3_dev.json b/test/params/quantms_1-3_dev.json new file mode 100644 index 00000000..29326c7f --- /dev/null +++ b/test/params/quantms_1-3_dev.json @@ -0,0 +1,162 @@ +{ + "custom_config_base": "https://raw.githubusercontent.com/nf-core/configs/master", + "min_peptide_length": 6, + "alignment_order": "star", + "fdr_level": "psm_level_fdrs", + "msstatslfq_removeFewMeasurements": true, + "plaintext_email": false, + "luciphor_debug": 0, + "isotope_correction": false, + "extractpsmfeature_debug": 0, + "msstatsiso_rmpsm_withfewmea_withinrun": true, + "subset_max_train": 300000, + "protein_score": "best", + "feature_with_id_min_score": 0.1, + "ms2rescore": false, + "shuffle_sequence_identity_threshold": 0.5, + "protein_inference_method": "aggregation", + "min_corr": 2, + "reindex_mzml": true, + "min_reporter_intensity": 0, + "normalize": false, + "skip_preliminary_analysis": false, + "description_correct_features": 0, + "decoy_string": "DECOY_", + "variable_mods": "Oxidation (M)", + "fragment_mass_tolerance": 0.03, + "msstatslfq_quant_summary_method": "TMP", + "skip_factor_validation": true, + "psm_level_fdr_cutoff": 0.01, + "skip_table_plots": false, + "scan_window_automatic": true, + "corr_diff": 1, + "enable_diann_mztab": true, + "pmultiqc_idxml_skip": true, + "version": false, + "openms_peakpicking": false, + "publish_dir_mode": "copy", + "input": "az://seqera/proteobench_dda_quant/dda_lfq_proteobench_v1.sdrf.tsv", + "feature_without_id_min_score": 0.75, + "msstatsiso_remove_norm_channel": true, + "min_precursor_charge": 2, + "consensusid_algorithm": "best", + "protein_quant": "unique_peptides", + "min_peptides_per_protein": 1, + "precursor_isotope_deviation": 10, + "num_hits": 1, + "precursor_mass_tolerance": 5, + "average": "median", + "decoy_method": "reverse", + "allowed_missed_cleavages": 2, + "max_peptide_length": 40, + "iso_normalization": 
false, + "protein_level_fdr_cutoff": 0.01, + "random_preanalysis": false, + "diann_debug": 3, + "mass_acc_automatic": true, + "custom_config_version": "master", + "update_PSM_probabilities": false, + "feature_generators": "deeplc,ms2pip", + "msstats_remove_one_feat_prot": true, + "top": 3, + "fixed_mods": "Carbamidomethyl (C)", + "msstatsiso_summaryformultiple_psm": "sum", + "msstats_plot_profile_qc": false, + "root_folder": "az://seqera/test-data/LFQ_DDA/raw/", + "pp_debug": 0, + "email": "heweb@dtu.dk", + "fix_peptides": false, + "pg_level": 2, + "use_ols_cache_only": false, + "IL_equivalent": true, + "acquisition_method": "dda", + "empirical_assembly_ms_n": 200, + "peakpicking_inmemory": false, + "run_fdr_cutoff": 0.1, + "lfq_intensity_threshold": 1000, + "protein_inference_debug": 0, + "local_input_type": "raw", + "quantification_method": "feature_intensity", + "enable_pmultiqc": true, + "outdir": "az://seqera/results_msquant_proteobench_raw", + "use_shared_peptides": true, + "pipelines_testdata_base_path": "https://raw.githubusercontent.com/nf-core/test-datasets/", + "sage_processes": 1, + "help": false, + "min_precursor_purity": 0, + "enable_mod_localization": false, + "train_FDR": 0.05, + "skip_ms_validation": false, + "export_mztab": true, + "klammer": false, + "search_engines": "comet", + "idfilter_debug": 0, + "msstats_threshold": 0.05, + "monochrome_logs": false, + "diann_normalize": true, + "test_FDR": 0.05, + "precursor_mass_tolerance_unit": "ppm", + "protocol": "automatic", + "skip_experimental_design_validation": false, + "add_triqler_output": false, + "targeted_only": true, + "max_multiqc_email_size": "25.MB", + "msstatsiso_useunique_peptide": true, + "min_consensus_support": 0, + "time_corr_only": true, + "max_precursor_charge": 4, + "validate_params": true, + "consensusid_debug": 0, + "min_peaks": 10, + "select_activation": "HCD", + "isotope_error_range": "0,1", + "best_charge_and_fraction": false, + "mod_localization": "Phospho (S),Phospho 
(T),Phospho (Y)", + "num_enzyme_termini": "fully", + "add_decoys": true, + "percolator_debug": 0, + "fragment_mass_tolerance_unit": "Da", + "msstatsiso_summarization_method": "msstats", + "trace_report_suffix": "2025-01-23_16-38-37", + "db_debug": 0, + "min_precursor_intensity": 1, + "export_decoy_psm": true, + "species_genes": false, + "ms2pip_model": "HCD2021", + "picked_fdr": true, + "scan_window": 8, + "iso_debug": 0, + "msstatsiso_global_norm": true, + "msstatslfq_feature_subset_protein": "top3", + "mass_recalibration": false, + "shuffle_max_attempts": 30, + "protein_quant_debug": 0, + "id_only": false, + "database": "az://seqera/proteobench_dda_quant/ProteoBenchFASTA_DDAQuantification.fasta", + "reporter_mass_shift": 0.002, + "rescore_range": "independent_run", + "random_preanalysis_seed": 42, + "calibration_set_size": 0.15, + "quantify_decoys": false, + "reference_channel": "126", + "contrasts": "pairwise", + "idmapper_debug": 0, + "ratios": false, + "enzyme": "Trypsin", + "include_all": true, + "add_snr_feature_percolator": false, + "validate_ontologies": true, + "consensusid_considered_top_hits": 0, + "top_PSMs": 1, + "convert_dotd": false, + "unmatched_action": "warn", + "skip_rescoring": false, + "msstatsiso_reference_normalization": true, + "idscoreswitcher_debug": 0, + "skip_post_msstats": false, + "decoy_string_position": "prefix", + "max_mods": 3, + "decoydatabase_debug": 0, + "plfq_debug": 0, + "fragment_method": "HCD" +} \ No newline at end of file From 05f71b072c11613930d4689d33ed4b45444306af Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sun, 2 Feb 2025 16:55:53 +0100 Subject: [PATCH 7/8] :bug: json could not be reloaded --- proteobench/io/params/quantms.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/proteobench/io/params/quantms.py b/proteobench/io/params/quantms.py index ab6b9845..b48daf4c 100644 --- a/proteobench/io/params/quantms.py +++ b/proteobench/io/params/quantms.py @@ -43,6 +43,7 @@ def load_files(file1: IO, file2: IO, 
file3: IO) -> [dict, pd.DataFrame]: _versions = load_versions(file) if "Workflow" not in _versions: logger.debug("Loaded other file.") + file.seek(0) elif versions is None: versions = _versions continue @@ -77,6 +78,11 @@ def load_files(file1: IO, file2: IO, file3: IO) -> [dict, pd.DataFrame]: raise ValueError("Multiple SDRF files provided.") except pd.errors.EmptyDataError as e: pass + + assert versions is not None + assert sdrf is not None + assert pipeline_params is not None + return versions, sdrf, pipeline_params From 038fcf8509f4bfcb7c8ed19c0070ec0183a3d460 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Sun, 2 Feb 2025 16:56:17 +0100 Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=9A=A7=20continue=20mapping=20paramet?= =?UTF-8?q?ers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proteobench/io/params/quantms.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/proteobench/io/params/quantms.py b/proteobench/io/params/quantms.py index b48daf4c..02511e5f 100644 --- a/proteobench/io/params/quantms.py +++ b/proteobench/io/params/quantms.py @@ -118,6 +118,27 @@ def extract_params(file1: IO, file2: IO, file3: IO) -> ProteoBenchParameters: if engines_version: params.search_engine_version = ",".join(engines_version) + # "fdr_level": "psm_level_fdrs", + params.ident_fdr_psm = pipeline_params["psm_level_fdr_cutoff"] + params.ident_fdr_protein = pipeline_params["protein_level_fdr_cutoff"] + params.variable_mods = pipeline_params["variable_mods"] + params.fixed_mods = pipeline_params["fixed_mods"] + params.max_mods = pipeline_params["max_mods"] + params.min_precursor_charge = pipeline_params["min_precursor_charge"] + params.max_precursor_charge = pipeline_params["max_precursor_charge"] + params.max_peptide_length = pipeline_params["max_peptide_length"] + params.min_peptide_length = pipeline_params["min_peptide_length"] + params.precursor_mass_tolerance = pipeline_params["precursor_mass_tolerance"] + 
params.fragment_mass_tolerance = pipeline_params["fragment_mass_tolerance"] + params.allowed_miscleavages = pipeline_params["allowed_missed_cleavages"] + params.quantification_method = pipeline_params["quantification_method"] + params.protein_inference = pipeline_params["protein_inference_method"] + + # maybe (also) in sdrf infos? + # params.quantification_method = + # params.protein_inference = + # params.abundance_normalization_ions = + return (versions, sdrf, pipeline_params, params)