Skip to content

Commit

Permalink
✨ DataFrame and csv format for MQ parameters
Browse files Browse the repository at this point in the history
- potential: allow to combine MaxQuant parameter files
- easy to inspect csv parameter file

ToDo:
- could update the 4th index level to reflect some of the groups
 -> see comments in maxquant.py
  • Loading branch information
Henry committed Sep 26, 2023
1 parent d7e38b7 commit 532157f
Show file tree
Hide file tree
Showing 5 changed files with 1,180 additions and 0 deletions.
68 changes: 68 additions & 0 deletions proteobench/io/params/maxquant.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
"""Functionality to parse Maxqunt mqpar.xml parameter files"""
from __future__ import annotations

import collections
import json
import logging
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd

# Module-level logger named after this module, so its records can be filtered
# and attributed separately instead of being emitted on the root logger.
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -82,6 +85,48 @@ def read_file(file: str) -> dict:
return params


def flatten_dict_of_dicts(d: dict, parent_key: tuple = ()) -> list[tuple[tuple, object]]:
    """Build tuples for nested dictionaries for use as `pandas.MultiIndex`.

    Parameters
    ----------
    d : dict
        Nested dictionary for which all keys are flattened to tuples.
    parent_key : tuple, optional
        Tuple of outer keys (used for recursion), by default ``()``.
        Any falsy value (e.g. ``""``) is treated as "no outer key".

    Returns
    -------
    list
        Flattened records as ``(key_tuple, value)`` pairs:
        ``[((outer_key, ..., inner_key), value), ...]``.
        A list (not a dict) is returned because list values produce several
        records sharing the same key tuple.

    Raises
    ------
    ValueError
        If a list value contains an item that is neither a mapping nor a string.
    """
    # simplified and adapted from: https://stackoverflow.com/a/6027615/9684872
    # Local import: a bare `import collections` does not guarantee that the
    # `collections.abc` submodule has been loaded.
    from collections.abc import MutableMapping

    items = []
    for k, v in d.items():
        new_key = parent_key + (k,) if parent_key else (k,)
        if isinstance(v, MutableMapping):
            # Nested mapping: recurse with the extended key path.
            items.extend(flatten_dict_of_dicts(v, parent_key=new_key))
        elif isinstance(v, list):
            # Each list element becomes its own record(s) under the same key path.
            for item in v:
                if isinstance(item, MutableMapping):
                    items.extend(flatten_dict_of_dicts(item, parent_key=new_key))
                elif isinstance(item, str):
                    items.append((new_key, item))
                else:
                    # `!r` is the repr conversion; the former `:r` was an invalid
                    # format spec and never produced this message.
                    raise ValueError(f"Unknown item: {item!r}")
        else:
            # Leaf value (including None): emit as-is.
            items.append((new_key, v))
    return items


def build_Series_from_records(records, index_length=4):
    """Turn a nested parameter dictionary into a `pandas.Series` with a MultiIndex.

    Parameters
    ----------
    records : dict
        Nested dictionary; it is flattened with `flatten_dict_of_dicts`.
    index_length : int, optional
        Passed to `extend_tuple` for every flattened key, by default 4.
        NOTE(review): presumably the number of MultiIndex levels each key is
        padded to -- confirm against `extend_tuple`.

    Returns
    -------
    pandas.Series
        One entry per flattened record, indexed by the extended key tuples.
    """
    flat_items = flatten_dict_of_dicts(records)
    index_tuples = [extend_tuple(key, index_length) for key, _ in flat_items]
    values = [value for _, value in flat_items]
    multi_index = pd.MultiIndex.from_tuples(index_tuples)
    return pd.Series(values, index=multi_index)


# create a first version of json files to match
if __name__ == "__main__":
for test_file in [
Expand All @@ -101,3 +146,26 @@ def read_file(file: str) -> dict:
)
)
)
flattend = build_Series_from_records(record_example, 4)
flattend = flattend.to_frame("run_identifier")
flattend.to_csv(Path(test_file).with_suffix(".csv"))

# %%
# Drill down the first three index levels to a single parameter and show it
# as an integer.
first_search_tol = (
    flattend.loc["parameterGroups"].loc["parameterGroup"].loc["firstSearchTol"]
)
int(first_search_tol.squeeze())

# %%
# ! Parse msmsParamsArray
# Select the msmsParams rows and drop the innermost index level.
ms2_params = (
    flattend.loc["msmsParamsArray"].loc["msmsParams"].reset_index(-1, drop=True)
)
# Copy the value of every "Name" row into a new "mode" column on that row ...
ms2_params.loc["Name", "mode"] = ms2_params.loc["Name"].squeeze()
# ... and propagate it down to the following rows.
# `ffill()` replaces `fillna(method="ffill")`, which is deprecated in pandas >= 2.1.
ms2_params["mode"] = ms2_params["mode"].ffill()
# Make "mode" an additional index level so rows can be addressed as
# (parameter, mode).
ms2_params = ms2_params.set_index("mode", append=True)
# NOTE(review): "FTMS" is presumably one of the "Name" values -- confirm against the data.
ms2_params.loc[("MatchTolerance", "FTMS")]
# ? reset_index level -1
# ? update and fillna -> then set as index again
204 changes: 204 additions & 0 deletions test/params/mqpar1.5.3.30_MBR.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
,,,,run_identifier
name,,,,Session1
maxQuantVersion,,,,1.5.3.30
tempFolder,,,,
numThreads,,,,3
sendEmail,,,,false
fixedCombinedFolder,,,,
ionCountIntensities,,,,false
verboseColumnHeaders,,,,false
fullMinMz,,,,-1.7976931348623157E+308
fullMaxMz,,,,1.7976931348623157E+308
calcPeakProperties,,,,false
showCentroidMassDifferences,,,,false
showIsotopeMassDifferences,,,,false
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.raw"
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.raw"
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.raw"
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.raw"
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.raw"
filePaths,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.raw"
experiments,string,,,A_Sample_Alpha_01
experiments,string,,,A_Sample_Alpha_02
experiments,string,,,A_Sample_Alpha_03
experiments,string,,,B_Sample_Alpha_01
experiments,string,,,B_Sample_Alpha_02
experiments,string,,,B_Sample_Alpha_03
fractions,short,,,32767
fractions,short,,,32767
fractions,short,,,32767
fractions,short,,,32767
fractions,short,,,32767
fractions,short,,,32767
paramGroupIndices,int,,,0
paramGroupIndices,int,,,0
paramGroupIndices,int,,,0
paramGroupIndices,int,,,0
paramGroupIndices,int,,,0
paramGroupIndices,int,,,0
parameterGroups,parameterGroup,maxCharge,,7
parameterGroups,parameterGroup,minPeakLen,,2
parameterGroups,parameterGroup,useMs1Centroids,,false
parameterGroups,parameterGroup,useMs2Centroids,,false
parameterGroups,parameterGroup,cutPeaks,,true
parameterGroups,parameterGroup,gapScans,,1
parameterGroups,parameterGroup,minTime,,NaN
parameterGroups,parameterGroup,maxTime,,NaN
parameterGroups,parameterGroup,matchType,,MatchFromAndTo
parameterGroups,parameterGroup,centroidMatchTol,,8
parameterGroups,parameterGroup,centroidMatchTolInPpm,,true
parameterGroups,parameterGroup,centroidHalfWidth,,35
parameterGroups,parameterGroup,centroidHalfWidthInPpm,,true
parameterGroups,parameterGroup,valleyFactor,,1.4
parameterGroups,parameterGroup,advancedPeakSplitting,,false
parameterGroups,parameterGroup,intensityThreshold,,500
parameterGroups,parameterGroup,msInstrument,,0
parameterGroups,parameterGroup,intensityDetermination,,0
parameterGroups,parameterGroup,labelMods,string,
parameterGroups,parameterGroup,reQuantify,,false
parameterGroups,parameterGroup,lfqSkipNorm,,false
parameterGroups,parameterGroup,lfqMinEdgesPerNode,,3
parameterGroups,parameterGroup,lfqAvEdgesPerNode,,6
parameterGroups,parameterGroup,lfqMaxFeatures,,100000
parameterGroups,parameterGroup,fastLfq,,true
parameterGroups,parameterGroup,lfqRestrictFeatures,,false
parameterGroups,parameterGroup,lfqMinRatioCount,,2
parameterGroups,parameterGroup,useNormRatiosForHybridLfq,,true
parameterGroups,parameterGroup,maxLabeledAa,,0
parameterGroups,parameterGroup,maxNmods,,5
parameterGroups,parameterGroup,maxMissedCleavages,,2
parameterGroups,parameterGroup,multiplicity,,1
parameterGroups,parameterGroup,enzymes,string,Trypsin/P
parameterGroups,parameterGroup,enzymesFirstSearch,,
parameterGroups,parameterGroup,useEnzymeFirstSearch,,false
parameterGroups,parameterGroup,useVariableModificationsFirstSearch,,false
parameterGroups,parameterGroup,variableModifications,string,Oxidation (M)
parameterGroups,parameterGroup,variableModifications,string,Acetyl (Protein N-term)
parameterGroups,parameterGroup,useMultiModification,,false
parameterGroups,parameterGroup,multiModifications,,
parameterGroups,parameterGroup,isobaricLabels,,
parameterGroups,parameterGroup,variableModificationsFirstSearch,,
parameterGroups,parameterGroup,hasAdditionalVariableModifications,,false
parameterGroups,parameterGroup,additionalVariableModifications,,
parameterGroups,parameterGroup,additionalVariableModificationProteins,,
parameterGroups,parameterGroup,doMassFiltering,,true
parameterGroups,parameterGroup,firstSearchTol,,20
parameterGroups,parameterGroup,mainSearchTol,,4.5
parameterGroups,parameterGroup,searchTolInPpm,,true
parameterGroups,parameterGroup,isotopeMatchTol,,2
parameterGroups,parameterGroup,isotopeMatchTolInPpm,,true
parameterGroups,parameterGroup,isotopeTimeCorrelation,,0.6
parameterGroups,parameterGroup,theorIsotopeCorrelation,,0.6
parameterGroups,parameterGroup,recalibrationInPpm,,true
parameterGroups,parameterGroup,intensityDependentCalibration,,false
parameterGroups,parameterGroup,minScoreForCalibration,,70
parameterGroups,parameterGroup,matchLibraryFile,,false
parameterGroups,parameterGroup,libraryFile,,
parameterGroups,parameterGroup,matchLibraryMassTolPpm,,0
parameterGroups,parameterGroup,matchLibraryTimeTolMin,,0
parameterGroups,parameterGroup,matchLabelTimeTolMin,,0
parameterGroups,parameterGroup,reporterMassTolerance,,NaN
parameterGroups,parameterGroup,reporterPif,,NaN
parameterGroups,parameterGroup,filterPif,,false
parameterGroups,parameterGroup,reporterFraction,,NaN
parameterGroups,parameterGroup,reporterBasePeakRatio,,NaN
parameterGroups,parameterGroup,timsHalfWidth,,0
parameterGroups,parameterGroup,timsStep,,0
parameterGroups,parameterGroup,timsResolution,,0
parameterGroups,parameterGroup,timsMinMsmsIntensity,,0
parameterGroups,parameterGroup,timsRemovePrecursor,,true
parameterGroups,parameterGroup,crosslinkSearch,,false
parameterGroups,parameterGroup,crosslinkMaxMonoUnsaturated,,0
parameterGroups,parameterGroup,crosslinkMaxMonoSaturated,,0
parameterGroups,parameterGroup,crosslinkMaxDiUnsaturated,,0
parameterGroups,parameterGroup,crosslinkMaxDiSaturated,,0
parameterGroups,parameterGroup,crosslinkUseSeparateFasta,,false
parameterGroups,parameterGroup,crosslinkFastaFiles,,
parameterGroups,parameterGroup,crosslinkMode,,PeptidesWithCleavedLinker
parameterGroups,parameterGroup,lcmsRunType,,Standard
parameterGroups,parameterGroup,lfqMode,,1
parameterGroups,parameterGroup,enzymeMode,,0
parameterGroups,parameterGroup,enzymeModeFirstSearch,,0
fixedModifications,string,,,Carbamidomethyl (C)
fastaFiles,string,,,"/users/user/EuBIC benchmarking
project\MQ15330_MBR\BenchmarkFASTAModule1_DDA_NOCONTA.fasta"
fastaFilesFirstSearch,,,,
fixedSearchFolder,,,,
advancedRatios,,,,true
rtShift,,,,false
separateLfq,,,,false
lfqStabilizeLargeRatios,,,,true
lfqRequireMsms,,,,true
decoyMode,,,,revert
includeContaminants,,,,true
topxWindow,,,,100
maxPeptideMass,,,,4600
epsilonMutationScore,,,,true
mutatedPeptidesSeparately,,,,true
minDeltaScoreUnmodifiedPeptides,,,,0
minDeltaScoreModifiedPeptides,,,,6
minScoreUnmodifiedPeptides,,,,0
minScoreModifiedPeptides,,,,40
secondPeptide,,,,true
matchBetweenRuns,,,,true
matchUnidentifiedFeatures,,,,false
matchBetweenRunsFdr,,,,false
dependentPeptides,,,,false
dependentPeptideFdr,,,,0
dependentPeptideMassBin,,,,0
msmsConnection,,,,false
ibaq,,,,false
useDeltaScore,,,,false
splitProteinGroupsByTaxonomy,,,,false
taxonomyLevel,,,,Species
avalon,,,,false
ibaqLogFit,,,,false
razorProteinFdr,,,,true
deNovoSequencing,,,,false
deNovoVarMods,,,,true
massDifferenceSearch,,,,false
minPepLen,,,,7
peptideFdr,,,,0.01
proteinFdr,,,,0.01
siteFdr,,,,0.01
minPeptideLengthForUnspecificSearch,,,,8
maxPeptideLengthForUnspecificSearch,,,,25
useNormRatiosForOccupancy,,,,true
minPeptides,,,,1
minRazorPeptides,,,,1
minUniquePeptides,,,,0
useCounterparts,,,,false
advancedSiteIntensities,,,,true
customProteinQuantification,,,,false
customProteinQuantificationFile,,,,
minRatioCount,,,,2
restrictProteinQuantification,,,,true
restrictMods,string,,,Oxidation (M)
restrictMods,string,,,Acetyl (Protein N-term)
matchingTimeWindow,,,,0.7
alignmentTimeWindow,,,,20
numberOfCandidatesMultiplexedMsms,,,,25
numberOfCandidatesMsms,,,,15
massDifferenceMods,,,,
mainSearchMaxCombinations,,,,200
msmsParamsArray,msmsParams,MatchTolerance,,20
msmsParamsArray,msmsParams,DeisotopeTolerance,,7
msmsParamsArray,msmsParams,DeNovoTolerance,,10
msmsParamsArray,msmsParams,MatchTolerance,,0.5
msmsParamsArray,msmsParams,DeisotopeTolerance,,0.15
msmsParamsArray,msmsParams,DeNovoTolerance,,0.25
msmsParamsArray,msmsParams,MatchTolerance,,40
msmsParamsArray,msmsParams,DeisotopeTolerance,,0.01
msmsParamsArray,msmsParams,DeNovoTolerance,,0.02
msmsParamsArray,msmsParams,MatchTolerance,,0.5
msmsParamsArray,msmsParams,DeisotopeTolerance,,0.15
msmsParamsArray,msmsParams,DeNovoTolerance,,0.25
compositionPrediction,,,,0
quantMode,,,,1
variationMode,,,,none
Loading

0 comments on commit 532157f

Please sign in to comment.