diff --git a/.gitignore b/.gitignore index e2a06e8..bfcef11 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ cython_debug/ *.csv *_df.csv *.tsv +/tests/test_data/hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d/ diff --git a/pyproject.toml b/pyproject.toml index ca5d136..05c39ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "quantms-utils" description = "Python scripts and helpers for the quantMS workflow" readme = "README.md" license = "MIT" -version = "0.0.15" +version = "0.0.16" authors = [ "Yasset Perez-Riverol ", "Dai Chengxin ", diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py index 6561790..d62d967 100644 --- a/quantmsutils/__init__.py +++ b/quantmsutils/__init__.py @@ -1 +1 @@ -__version__ = "0.0.15" +__version__ = "0.0.16" diff --git a/quantmsutils/mzml/mzml_statistics.py b/quantmsutils/mzml/mzml_statistics.py index 1b3d706..9fd349d 100644 --- a/quantmsutils/mzml/mzml_statistics.py +++ b/quantmsutils/mzml/mzml_statistics.py @@ -1,238 +1,376 @@ import re import sqlite3 from pathlib import Path +from typing import Optional, List import click +import numpy as np import pandas as pd -import pyarrow -from pyopenms import MSExperiment, MzMLFile +import pyarrow as pa +import pyarrow.parquet as pq +from pyopenms import MzMLFile + + +class BatchWritingConsumer: + """ + A class to consume mass spectrometry data and write to a parquet file in batches from mzML files using + pyopenms streaming. + """ + + def __init__( + self, + parquet_schema: pa.Schema, + id_parquet_schema: pa.Schema, + output_path, + batch_size=10000, + id_only=False, + ): + self.parquet_schema = parquet_schema + self.id_parquet_schema = id_parquet_schema + self.output_path = output_path + self.batch_size = batch_size + self.id_only = id_only + self.batch_data = [] + self.psm_parts = [] + self.parquet_writer = None + self.id_parquet_writer = None + self.acquisition_datetime = None + self.scan_pattern = re.compile(r"[scan|spectrum]=(\d+)") + + def setExperimentalSettings(self, settings): + self.acquisition_datetime = settings.getDateTime().get() + + def setExpectedSize(self, a, b): + pass + + def consumeChromatogram(self, chromatogram): + pass + + def consumeSpectrum(self, spectrum): + """ + Consume spectrum data and write to parquet file. + :param spectrum: spectrum data. + :return: None + """ + + peaks = spectrum.get_peaks() + mz_array, intensity_array = peaks[0], peaks[1] + peak_per_ms = len(mz_array) + base_peak_intensity = float(np.max(intensity_array)) if peak_per_ms > 0 else None + total_intensity = float(np.sum(intensity_array)) if peak_per_ms > 0 else None + ms_level = spectrum.getMSLevel() + rt = spectrum.getRT() + + if ms_level == 2: + precursor = spectrum.getPrecursors()[0] + charge_state = precursor.getCharge() + exp_mz = precursor.getMZ() + + if self.id_only: + scan_id = self.scan_pattern.findall(spectrum.getNativeID())[0] + self.psm_parts.append( + [ + { + "scan": scan_id, + "ms_level": ms_level, + "mz": mz_array, + "intensity": intensity_array, + } + ] + ) + + row_data = { + "SpectrumID": spectrum.getNativeID(), + "MSLevel": float(ms_level), + "Charge": float(charge_state) if charge_state is not None else None, + "MS_peaks": float(peak_per_ms), + "Base_Peak_Intensity": ( + float(base_peak_intensity) if base_peak_intensity is not None else None + ), + "Summed_Peak_Intensities": ( + float(total_intensity) if total_intensity is not None else None + ), + "Retention_Time": float(rt), + "Exp_Mass_To_Charge": float(exp_mz) if exp_mz is not None else None, + "AcquisitionDateTime": str(self.acquisition_datetime), + } + elif ms_level == 1: + row_data = { + "SpectrumID": spectrum.getNativeID(), + "MSLevel": float(ms_level), + "Charge": None, + "MS_peaks": float(peak_per_ms), + "Base_Peak_Intensity": ( + float(base_peak_intensity) if base_peak_intensity is not None else None + ), + "Summed_Peak_Intensities": ( + float(total_intensity) if total_intensity is not None else None + ), + "Retention_Time": float(rt), + "Exp_Mass_To_Charge": None, + "AcquisitionDateTime": str(self.acquisition_datetime), + } + else: + return + + self.batch_data.append(row_data) + + # Write batch when it reaches specified size + if len(self.batch_data) >= self.batch_size: + self._write_batch() + + def _write_batch(self): + """ + Write accumulated batch data more efficiently using PyArrow's streaming writer. + + Improvements: + - Directly stream data without creating a full in-memory table + - Reduce memory overhead for large datasets + - More efficient batch processing + """ + try: + # If no data, return early + if not self.batch_data: + return + + # Initialize writers lazily if not already created + if self.parquet_writer is None: + self.parquet_writer = pq.ParquetWriter( + where=self.output_path, schema=self.parquet_schema, compression="gzip" + ) + + # Create a RecordBatch directly from the current batch + batch = pa.RecordBatch.from_pylist(self.batch_data, schema=self.parquet_schema) + + # Write the batch directly + self.parquet_writer.write_batch(batch) + + # Clear the batch data + self.batch_data = [] + + # Handle ID-only data if applicable + if self.id_only and self.psm_parts: + # Similar approach for spectrum ID data + if self.id_parquet_writer is None: + self.id_parquet_writer = pq.ParquetWriter( + where=f"{Path(self.output_path).stem}_spectrum_df.parquet", + schema=self.id_parquet_schema, + compression="gzip", + ) + + id_batch = pa.RecordBatch.from_pylist( + self.psm_parts, schema=self.id_parquet_schema + ) + self.id_parquet_writer.write_batch(id_batch) + self.psm_parts = [] + + except Exception as e: + print(f"Error during batch writing: {e}") + raise + + def finalize(self): + """ + Finalize the writing process. + :return: + """ + if self.batch_data: + self._write_batch() + + # Write spectrum data if id_only + if self.id_only and self.psm_parts: + self._write_batch() + + if self.parquet_writer: + self.parquet_writer.close() + + if self.id_parquet_writer: + self.id_parquet_writer.close() + + +def column_exists(conn, table_name: str) -> List[str]: + """ + Fetch the existing columns in the specified SQLite table. + """ + table_info = pd.read_sql_query(f"PRAGMA table_info({table_name});", conn) + return set(table_info["name"].tolist()) @click.command("mzmlstats") -@click.option("--ms_path", type=click.Path(exists=True)) +@click.option("--ms_path", type=click.Path(exists=True), required=True) +@click.option("--id_only", is_flag=True, help="Generate a csv with the spectrum id and the peaks") @click.option( - "--id_only", is_flag=True, help="Generate a csv with the spectrum id and the peaks" + "--batch_size", type=int, default=10000, help="Number of rows to write in each batch" ) @click.pass_context -def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None: +def mzml_statistics(ctx, ms_path: str, id_only: bool = False, batch_size: int = 10000) -> None: """ The mzml_statistics function parses mass spectrometry data files, either in .mzML or Bruker .d formats, to extract and compile a set of statistics about the spectra contained within. It supports generating detailed or ID-only CSV files based on the spectra data. # Command line usage example - python script_name.py mzml_statistics --ms_path "path/to/file.mzML" + quantmsutilsc mzmlstats --ms_path "path/to/file.mzML" :param ctx: Click context + :param ms_path: A string specifying the path to the mass spectrometry file. :param id_only: A boolean flag that, when set to True, generates a CSV file containing only the spectrum ID and peaks data for MS level 2 spectra. + :param batch_size: An integer specifying the number of rows to write in each batch. """ - file_columns = [ - "SpectrumID", - "MSLevel", - "Charge", - "MS_peaks", - "Base_Peak_Intensity", - "Summed_Peak_Intensities", - "Retention_Time", - "Exp_Mass_To_Charge", - "AcquisitionDateTime", - ] - - def parse_mzml(file_name: str, file_columns: list, id_only: bool = False): + schema = pa.schema( + [ + pa.field("SpectrumID", pa.string(), nullable=True), + pa.field("MSLevel", pa.float64(), nullable=True), + pa.field("Charge", pa.float64(), nullable=True), + pa.field("MS_peaks", pa.float64(), nullable=True), + pa.field("Base_Peak_Intensity", pa.float64(), nullable=True), + pa.field("Summed_Peak_Intensities", pa.float64(), nullable=True), + pa.field("Retention_Time", pa.float64(), nullable=True), + pa.field("Exp_Mass_To_Charge", pa.float64(), nullable=True), + pa.field("AcquisitionDateTime", pa.string(), nullable=True), + ] + ) + + id_schema = pa.schema( + [ + ("scan", pa.string()), + ("ms_level", pa.int32()), + ("mz", pa.list_(pa.float64())), + ("intensity", pa.list_(pa.float64())), + ] + ) + + def batch_write_mzml_streaming( + file_name: str, + parquet_schema: pa.Schema, + output_path: str, + id_parquet_schema: pa.Schema, + id_only: bool = False, + batch_size: int = 10000, + ) -> Optional[str]: + """ + Parse mzML file in a streaming manner and write to Parquet. + """ + consumer = BatchWritingConsumer( + parquet_schema=parquet_schema, + output_path=output_path, + batch_size=batch_size, + id_only=id_only, + id_parquet_schema=id_parquet_schema, + ) + try: + MzMLFile().transform(file_name.encode(), consumer) + consumer.finalize() + return output_path + except Exception as e: + print(f"Error during streaming: {e}") + return None + + def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 10000) -> str: """ - Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv. - @param file_name: The file name of the mzML file - @param file_columns: The columns of the DataFrame - @param id_only: If True, it will save a csv with the spectrum id, mz and intensity - @return: A pandas DataFrame with the information of the mzML file + Batch processing and writing of Bruker .d files. """ + sql_filepath = f"{file_name}/analysis.tdf" - info = [] - psm_part_info = [] - exp = MSExperiment() - acquisition_datetime = exp.getDateTime().get() - MzMLFile().load(file_name, exp) - for spectrum in exp: - id_ = spectrum.getNativeID() - ms_level = spectrum.getMSLevel() - rt = spectrum.getRT() if spectrum.getRT() else None - - peaks_tuple = spectrum.get_peaks() - peak_per_ms = len(peaks_tuple[0]) - - if not spectrum.metaValueExists("base peak intensity"): - bpc = max(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - bpc = spectrum.getMetaValue("base peak intensity") - - if not spectrum.metaValueExists("total ion current"): - tic = sum(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - tic = spectrum.getMetaValue("total ion current") - - if ms_level == 1: - info_list = [ - id_, - ms_level, - None, - peak_per_ms, - bpc, - tic, - rt, - None, - acquisition_datetime, - ] - elif ms_level == 2: - charge_state = spectrum.getPrecursors()[0].getCharge() - emz = ( - spectrum.getPrecursors()[0].getMZ() - if spectrum.getPrecursors()[0].getMZ() - else None - ) - info_list = [ - id_, - ms_level, - charge_state, - peak_per_ms, - bpc, - tic, - rt, - emz, - acquisition_datetime, - ] - mz_array = peaks_tuple[0] - intensity_array = peaks_tuple[1] - else: - info_list = [ - id_, - ms_level, - None, - None, - None, - None, - rt, - None, - acquisition_datetime, - ] + with sqlite3.connect(sql_filepath) as conn: + # Retrieve acquisition datetime + acquisition_date_time = conn.execute( + "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" + ).fetchone()[0] - if id_only and ms_level == 2: - psm_part_info.append( - [ - re.findall(r"[scan|spectrum]=(\d+)", id_)[0], - ms_level, - mz_array, - intensity_array, - ] - ) - info.append(info_list) - - if id_only and len(psm_part_info) > 0: - pd.DataFrame( - psm_part_info, columns=["scan", "ms_level", "mz", "intensity"] - ).to_parquet( - f"{Path(ms_path).stem}_spectrum_df.parquet", - index=False, - compression="gzip", + # Check which optional columns exist + columns = column_exists(conn, "frames") + + # Get allowed columns from the schema + allowed_columns = { + "Id": "Id", + "MsMsType": "CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END", + "NumPeaks": "NumPeaks", + "MaxIntensity": "MaxIntensity", + "SummedIntensities": "SummedIntensities", + "Time": "Time", + "Charge": "Charge", + "MonoisotopicMz": "MonoisotopicMz", + } + + # Construct safe column list + safe_columns = [] + column_mapping = {} + for schema_col_name, sql_expr in allowed_columns.items(): + if schema_col_name in columns or schema_col_name == "Id": + safe_columns.append(sql_expr) + column_mapping[schema_col_name] = sql_expr + + # Construct the query using parameterized safe columns + query = f"""SELECT {', '.join(safe_columns)} FROM frames""" + + schema = pa.schema( + [ + pa.field("Id", pa.int32(), nullable=False), + pa.field("MsMsType", pa.int32(), nullable=True), + pa.field("NumPeaks", pa.int32(), nullable=True), + pa.field("MaxIntensity", pa.float64(), nullable=True), + pa.field("SummedIntensities", pa.float64(), nullable=True), + pa.field("Time", pa.float64(), nullable=True), + pa.field("Charge", pa.int32(), nullable=True), + pa.field("MonoisotopicMz", pa.float64(), nullable=True), + pa.field("AcquisitionDateTime", pa.string(), nullable=True), + ] ) - return pd.DataFrame(info, columns=file_columns) + # Set up parquet writer + parquet_writer = pq.ParquetWriter(output_path, schema=schema, compression="gzip") - def parse_bruker_d(file_name: str, file_columns: list): - sql_filepath = f"{file_name}/analysis.tdf" - if not Path(sql_filepath).exists(): - msg = f"File '{sql_filepath}' not found" - raise FileNotFoundError(msg) - conn = sqlite3.connect(sql_filepath) - c = conn.cursor() - - datetime_cmd = ( - "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" - ) - acquisition_date_time = c.execute(datetime_cmd).fetchall()[0][0] + try: + # Stream data in batches + for chunk in pd.read_sql_query(query, conn, chunksize=batch_size): + chunk["AcquisitionDateTime"] = acquisition_date_time + for col in schema.names: + if col not in chunk.columns: + chunk[col] = None + batch_table = pa.Table.from_pandas(chunk, schema=schema) + parquet_writer.write_table(batch_table) - df = pd.read_sql_query( - "SELECT Id, MsMsType, NumPeaks, MaxIntensity, SummedIntensities, Time FROM frames", - conn, - ) - df["AcquisitionDateTime"] = acquisition_date_time + finally: + parquet_writer.close() - # {8:'DDA-PASEF', 9:'DIA-PASEF'} - if 8 in df["MsMsType"].values: - mslevel_map = {0: 1, 8: 2} - elif 9 in df["MsMsType"].values: - mslevel_map = {0: 1, 9: 2} - else: - msg = f"Unrecognized ms type '{df['MsMsType'].values}'" - raise ValueError(msg) - df["MsMsType"] = df["MsMsType"].map(mslevel_map) + return output_path - try: - # This line raises an sqlite error if the table does not exist - _ = conn.execute("SELECT * from Precursors LIMIT 1").fetchall() - precursor_df = pd.read_sql_query("SELECT * from Precursors", conn) - except sqlite3.OperationalError as e: - if "no such table: Precursors" in str(e): - print( - f"No precursors recorded in {file_name}, This is normal for DIA data." - ) - precursor_df = pd.DataFrame() - else: - raise + # Resolve file path + ms_path = _resolve_ms_path(ms_path) + output_path = f"{Path(ms_path).stem}_ms_info.parquet" - if len(df) == len(precursor_df): - df = pd.concat([df, precursor_df["Charge", "MonoisotopicMz"]], axis=1) - df["Charge"] = df["Charge"].fillna(0) - else: - df[["Charge", "Exp_Mass_To_Charge"]] = None, None - - df = df[ - [ - "Id", - "MsMsType", - "Charge", - "NumPeaks", - "MaxIntensity", - "SummedIntensities", - "Time", - "Exp_Mass_To_Charge", - "AcquisitionDateTime", - ] - ] - df.columns = pd.Index(file_columns) - - return df - - if not (Path(ms_path).exists()): - print(f"Not found '{ms_path}', trying to find alias") - ms_path_path = Path(ms_path) - path_stem = str(ms_path_path.stem) - candidates = ( - list(ms_path_path.parent.glob("*.d")) - + list(ms_path_path.parent.glob("*.mzml")) - + list(ms_path_path.parent.glob("*.mzML")) + # Choose processing method based on file type + if Path(ms_path).suffix == ".d": + batch_write_bruker_d(file_name=ms_path, output_path=output_path, batch_size=batch_size) + elif Path(ms_path).suffix.lower() in [".mzml"]: + batch_write_mzml_streaming( + file_name=ms_path, + parquet_schema=schema, + id_parquet_schema=id_schema, + output_path=output_path, + id_only=id_only, + batch_size=batch_size, ) + else: + raise RuntimeError(f"Unsupported file type: {ms_path}") - candidates = [c for c in candidates if path_stem in str(c)] - if len(candidates) == 1: - ms_path = str(candidates[0].resolve()) - else: - raise FileNotFoundError() +def _resolve_ms_path(ms_path: str) -> str: + """ + Resolve mass spectrometry file path with improved candidate search. + """ + path_obj = Path(ms_path) + if path_obj.exists(): + return str(path_obj) - if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir(): - ms_df = parse_bruker_d(ms_path, file_columns) - elif Path(ms_path).suffix in [".mzML", ".mzml"]: - ms_df = parse_mzml(ms_path, file_columns, id_only) - else: - msg = f"Unrecognized or the mass spec file '{ms_path}' do not exist" - raise RuntimeError(msg) - - ms_df.to_parquet( - f"{Path(ms_path).stem}_ms_info.parquet", - engine="pyarrow", - index=False, - compression="gzip", - ) + candidates = list(path_obj.parent.glob(f"{path_obj.stem}*")) + valid_extensions = {".d", ".mzml", ".mzML"} + candidates = [str(c.resolve()) for c in candidates if c.suffix.lower() in valid_extensions] + + if len(candidates) == 1: + return candidates[0] + + raise FileNotFoundError(f"No unique file found for {ms_path}") diff --git a/quantmsutils/psm/psm_conversion.py b/quantmsutils/psm/psm_conversion.py index 232f3af..9b06705 100644 --- a/quantmsutils/psm/psm_conversion.py +++ b/quantmsutils/psm/psm_conversion.py @@ -141,6 +141,8 @@ def convert_psm( if hit.metaValueExists("MS:1001491"): global_qvalue = hit.getMetaValue("MS:1001491") + elif hit.metaValueExists("q-value"): + global_qvalue = hit.getMetaValue("q-value") charge = hit.getCharge() peptidoform = hit.getSequence().toString() diff --git a/quantmsutils/rescoring/ms2rescore.py b/quantmsutils/rescoring/ms2rescore.py index 030c59c..4c46809 100644 --- a/quantmsutils/rescoring/ms2rescore.py +++ b/quantmsutils/rescoring/ms2rescore.py @@ -5,17 +5,130 @@ import importlib.resources import json import logging -from typing import List import click import pyopenms as oms from ms2rescore import package_data, rescore from psm_utils import PSMList from psm_utils.io.idxml import IdXMLReader, IdXMLWriter +from typing import Iterable, List, Union +from pathlib import Path +from psm_utils.psm import PSM logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +class IDXMLReaderPatch(IdXMLReader): + def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: + """ + Patch Reader for idXML files based on IDXMLReader. + + Parameters + ---------- + filename: str, pathlib.Path + Path to idXML file. + + Examples + -------- + """ + super().__init__(filename, *args, **kwargs) + self.protein_ids, self.peptide_ids = self._parse_idxml() + self.user_params_metadata = self._get_userparams_metadata(self.peptide_ids[0].getHits()[0]) + self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0]) + self.skip_invalid_psm = 0 + + def __iter__(self) -> Iterable[PSM]: + """ + Iterate over file and return PSMs one-by-one. + Test cases will: + + Input PSM 1: PeptideHit with metavalue + "MSGF:ScoreRatio" value="0.212121212121212"/> + "MSGF:Energy" value="130.0"/> + "MSGF:lnEValue" value="-3.603969939390662"/> + "MSGF:lnExplainedIonCurrentRatio" value="-0.881402756873971"/> + "MSGF:lnNTermIonCurrentRatio" value="-1.931878317286471"/> + "MSGF:lnCTermIonCurrentRatio" value="-1.311462733724937"/> + "MSGF:lnMS2IonCurrent" value="9.702930189540499"/> + "MSGF:MeanErrorTop7" value="259.986879999999985"/> + "MSGF:sqMeanErrorTop7" value="6.75931777721344e04"/> + "MSGF:StdevErrorTop7" value="143.678020000000004"/> + PSM2: PeptideHit No above metaValue + + Run: + reader = IDXMLReaderPatch(input_file) + psm_list = reader.read_file() + + psm_list: return [PSM 1] + + """ + for peptide_id in self.peptide_ids: + for peptide_hit in peptide_id.getHits(): + psm = self._parse_psm(self.protein_ids, peptide_id, peptide_hit) + if psm is not None: + yield psm + else: + self.skip_invalid_psm += 1 + + def _parse_psm( + self, + protein_ids: oms.ProteinIdentification, + peptide_id: oms.PeptideIdentification, + peptide_hit: oms.PeptideHit, + ) -> PSM: + """ + Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. + + Uses additional information from :py:class:`~pyopenms.ProteinIdentification` and + :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the + :py:class:`~psm_utils.psm.PSM` object. + """ + peptidoform = self._parse_peptidoform( + peptide_hit.getSequence().toString(), peptide_hit.getCharge() + ) + # This is needed to calculate a qvalue before rescoring the PSMList + peptide_id_metadata = { + "idxml:score_type": str(peptide_id.getScoreType()), + "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), + "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()), + } + peptide_hit_metadata = { + key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata + } + + # Get search engines score features and check valueExits + rescoring_features = {} + for key in self.rescoring_features: + feature = peptide_hit.metaValueExists(key) + if not feature: + return None + else: + rescoring_features[key] = float(peptide_hit.getMetaValue(key)) + + return PSM( + peptidoform=peptidoform, + spectrum_id=peptide_id.getMetaValue("spectrum_reference"), + run=self._get_run(protein_ids, peptide_id), + is_decoy=self._is_decoy(peptide_hit), + score=peptide_hit.getScore(), + precursor_mz=peptide_id.getMZ(), + retention_time=peptide_id.getRT(), + # NOTE: ion mobility will be supported by OpenMS in the future + protein_list=[ + accession.decode() for accession in peptide_hit.extractProteinAccessionsSet() + ], + rank=peptide_hit.getRank() + 1, # 0-based to 1-based + source="idXML", + # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back + # to original sequence in writer + provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, + # Store metadata of PeptideIdentification and PeptideHit objects + metadata={**peptide_id_metadata, **peptide_hit_metadata}, + + rescoring_features=rescoring_features, + ) + + def parse_cli_arguments_to_config( config_file: str = None, feature_generators: str = None, @@ -119,9 +232,14 @@ def parse_cli_arguments_to_config( def rescore_idxml(input_file, output_file, config) -> None: """Rescore PSMs in an idXML file and keep other information unchanged.""" # Read PSMs - reader = IdXMLReader(input_file) + reader = IDXMLReaderPatch(input_file) psm_list = reader.read_file() + if reader.skip_invalid_psm != 0: + logging.warning( + f"Removed {reader.skip_invalid_psm} PSMs without search engine features!" + ) + # Rescore rescore(config, psm_list) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 2b3fb7f..6b076ba 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,7 +1,7 @@ # recipe/meta.yaml package: name: quantms-utils - version: "0.0.15" + version: "0.0.16" source: path: ../