From c35056083516c2c522d91a5f7fc1e21d9d32faaf Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 25 Oct 2023 15:06:48 +0200 Subject: [PATCH 1/7] transfer simulation framework from proteolizard --- imspy/imspy/data.py | 4 - imspy/imspy/data/__init__.py | 4 + imspy/imspy/data/frame.py | 391 +++++++++ imspy/imspy/data/handle.py | 157 ++++ imspy/imspy/data/slice.py | 394 +++++++++ imspy/imspy/data/spectrum.py | 377 +++++++++ imspy/imspy/feature.py | 211 +++++ imspy/imspy/handle.py | 5 + imspy/imspy/isotopes.py | 242 ++++++ imspy/imspy/noise.py | 124 +++ imspy/imspy/proteome.py | 322 ++++++++ imspy/imspy/simulation/experiment.py | 204 +++++ imspy/imspy/simulation/hardware_models.py | 766 ++++++++++++++++++ imspy/imspy/slice.py | 6 + imspy/imspy/utility.py | 508 ++++++++++++ .../simulation/run_example_simulation.py | 104 +++ pyims/pyims/chemistry.py | 213 +++++ 17 files changed, 4028 insertions(+), 4 deletions(-) delete mode 100644 imspy/imspy/data.py create mode 100644 imspy/imspy/data/__init__.py create mode 100644 imspy/imspy/data/frame.py create mode 100644 imspy/imspy/data/handle.py create mode 100644 imspy/imspy/data/slice.py create mode 100644 imspy/imspy/data/spectrum.py create mode 100644 imspy/imspy/feature.py create mode 100644 imspy/imspy/isotopes.py create mode 100644 imspy/imspy/noise.py create mode 100644 imspy/imspy/proteome.py create mode 100644 imspy/imspy/simulation/experiment.py create mode 100644 imspy/imspy/simulation/hardware_models.py create mode 100644 imspy/imspy/utility.py create mode 100644 pyims/examples/simulation/run_example_simulation.py create mode 100644 pyims/pyims/chemistry.py diff --git a/imspy/imspy/data.py b/imspy/imspy/data.py deleted file mode 100644 index d2674bc8..00000000 --- a/imspy/imspy/data.py +++ /dev/null @@ -1,4 +0,0 @@ -from .frame import TimsFrame -from .spectrum import TimsSpectrum, MzSpectrum -from .handle import TimsDataset, TimsDatasetDDA, TimsDatasetDIA -from .slice import TimsSlice, TimsSliceVectorized diff --git a/imspy/imspy/data/__init__.py b/imspy/imspy/data/__init__.py new file mode 100644 index 00000000..1809a4f5 --- /dev/null +++ b/imspy/imspy/data/__init__.py @@ -0,0 +1,4 @@ +from pyims.data.frame import TimsFrame +from pyims.data.spectrum import TimsSpectrum, MzSpectrum +from pyims.data.handle import TimsDataset, TimsDatasetDDA, TimsDatasetDIA +from pyims.data.slice import TimsSlice diff --git a/imspy/imspy/data/frame.py b/imspy/imspy/data/frame.py new file mode 100644 index 00000000..659209b0 --- /dev/null +++ b/imspy/imspy/data/frame.py @@ -0,0 +1,391 @@ +import pandas as pd + +from typing import List +from numpy.typing import NDArray + +from tensorflow import sparse as sp + +import numpy as np +import imspy_connector as pims +from imspy.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum + +from imspy.utilities import re_index_indices + + +class TimsFrame: + def __init__(self, frame_id: int, ms_type: int, retention_time: float, scan: NDArray[np.int32], + mobility: NDArray[np.float64], tof: NDArray[np.int32], + mz: NDArray[np.float64], intensity: NDArray[np.float64]): + """TimsFrame class. + + Args: + frame_id (int): Frame ID. + ms_type (int): MS type. + retention_time (float): Retention time. + scan (NDArray[np.int32]): Scan. + mobility (NDArray[np.float64]): Inverse mobility. + tof (NDArray[np.int32]): Time of flight. + mz (NDArray[np.float64]): m/z. + intensity (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the scan, mobility, tof, mz and intensity arrays are not equal. + """ + + assert len(scan) == len(mobility) == len(tof) == len(mz) == len(intensity), \ + "The length of the scan, mobility, tof, mz and intensity arrays must be equal." + + self.__frame_ptr = pims.PyTimsFrame(frame_id, ms_type, retention_time, scan, mobility, tof, mz, intensity) + + @classmethod + def from_py_tims_frame(cls, frame: pims.PyTimsFrame): + """Create a TimsFrame from a PyTimsFrame. + + Args: + frame (pims.PyTimsFrame): PyTimsFrame to create the TimsFrame from. + + Returns: + TimsFrame: TimsFrame created from the PyTimsFrame. + """ + instance = cls.__new__(cls) + instance.__frame_ptr = frame + return instance + + @property + def frame_id(self) -> int: + """Frame ID. + + Returns: + int: Frame ID. + """ + return self.__frame_ptr.frame_id + + @property + def ms_type(self) -> str: + """MS type. + + Returns: + int: MS type. + """ + return self.__frame_ptr.ms_type_as_string + + @property + def retention_time(self) -> float: + """Retention time. + + Returns: + float: Retention time. + """ + return self.__frame_ptr.retention_time + + @property + def scan(self) -> NDArray[np.int32]: + """Scan. + + Returns: + NDArray[np.int32]: Scan. + """ + return self.__frame_ptr.scan + + @property + def mobility(self) -> NDArray[np.float64]: + """Inverse mobility. + + Returns: + NDArray[np.float64]: Inverse mobility. + """ + return self.__frame_ptr.mobility + + @property + def tof(self) -> NDArray[np.int32]: + """Time of flight. + + Returns: + NDArray[np.int32]: Time of flight. + """ + return self.__frame_ptr.tof + + @property + def mz(self) -> NDArray[np.float64]: + """m/z. + + Returns: + NDArray[np.float64]: m/z. + """ + return self.__frame_ptr.mz + + @property + def intensity(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__frame_ptr.intensity + + @property + def df(self) -> pd.DataFrame: + """ Data as a pandas DataFrame. + + Returns: + pd.DataFrame: Data. + """ + + return pd.DataFrame({ + 'frame': np.repeat(self.frame_id, len(self.scan)), + 'retention_time': np.repeat(self.retention_time, len(self.scan)), + 'scan': self.scan, + 'mobility': self.mobility, + 'tof': self.tof, + 'mz': self.mz, + 'intensity': self.intensity}) + + def filter(self, + mz_min: float = 0.0, + mz_max: float = 2000.0, + scan_min: int = 0, + scan_max: int = 1000, + mobility_min: float = 0.0, + mobility_max: float = 2.0, + intensity_min: float = 0.0, + intensity_max: float = 1e9, + ) -> 'TimsFrame': + """Filter the frame for a given m/z range, scan range and intensity range. + + Args: + mz_min (float): Minimum m/z value. + mz_max (float): Maximum m/z value. + scan_min (int, optional): Minimum scan value. Defaults to 0. + scan_max (int, optional): Maximum scan value. Defaults to 1000. + mobility_min (float, optional): Minimum inverse mobility value. Defaults to 0.0. + mobility_max (float, optional): Maximum inverse mobility value. Defaults to 2.0. + intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. + intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. + + Returns: + TimsFrame: Filtered frame. + """ + + return TimsFrame.from_py_tims_frame(self.__frame_ptr.filter_ranged(mz_min, mz_max, scan_min, scan_max, mobility_min, mobility_max, + intensity_min, intensity_max)) + + def to_indexed_mz_spectrum(self) -> 'IndexedMzSpectrum': + """Convert the frame to an IndexedMzSpectrum. + + Returns: + IndexedMzSpectrum: IndexedMzSpectrum. + """ + return IndexedMzSpectrum.from_py_indexed_mz_spectrum(self.__frame_ptr.to_indexed_mz_spectrum()) + + def to_resolution(self, resolution: int) -> 'TimsFrame': + """Convert the frame to a given resolution. + + Args: + resolution (int): Resolution. + + Returns: + TimsFrame: Frame with the given resolution. + """ + return TimsFrame.from_py_tims_frame(self.__frame_ptr.to_resolution(resolution)) + + def vectorized(self, resolution: int = 2) -> 'TimsFrameVectorized': + """Convert the frame to a vectorized frame. + + Args: + resolution (int, optional): Resolution. Defaults to 2. + + Returns: + TimsFrameVectorized: Vectorized frame. + """ + return TimsFrameVectorized.from_py_tims_frame_vectorized(self.__frame_ptr.vectorized(resolution)) + + def to_tims_spectra(self) -> List['TimsSpectrum']: + """Convert the frame to a list of TimsSpectrum. + + Returns: + List[TimsSpectrum]: List of TimsSpectrum. + """ + return [TimsSpectrum.from_py_tims_spectrum(spec) for spec in self.__frame_ptr.to_tims_spectra()] + + def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, + min_intensity: float = 1) -> List[MzSpectrum]: + """Convert the frame to a list of windows. + + Args: + window_length (float, optional): Window length. Defaults to 10. + overlapping (bool, optional): Whether the windows should overlap. Defaults to True. + min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. + min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. + + Returns: + List[MzSpectrum]: List of windows. + """ + return [MzSpectrum.from_py_mz_spectrum(spec) for spec in self.__frame_ptr.to_windows( + window_length, overlapping, min_num_peaks, min_intensity)] + + def __repr__(self): + return (f"TimsFrame(frame_id={self.__frame_ptr.frame_id}, ms_type={self.__frame_ptr.ms_type}, " + f"num_peaks={len(self.__frame_ptr.mz)})") + + +class TimsFrameVectorized: + def __init__(self, frame_id: int, ms_type: int, retention_time: float, scan: NDArray[np.int32], + mobility: NDArray[np.float64], tof: NDArray[np.int32], + indices: NDArray[np.int32], intensity: NDArray[np.float64]): + """TimsFrameVectorized class. + + Args: + frame_id (int): Frame ID. + ms_type (int): MS type. + retention_time (float): Retention time. + scan (NDArray[np.int32]): Scan. + mobility (NDArray[np.float64]): Inverse mobility. + tof (NDArray[np.int32]): Time of flight. + indices (NDArray[np.int32]): Indices. + intensity (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the scan, mobility, tof, indices and intensity arrays are not equal. + """ + + assert len(scan) == len(mobility) == len(tof) == len(indices) == len(intensity), \ + "The length of the scan, mobility, tof, indices and intensity arrays must be equal." + + self.__frame_ptr = pims.PyTimsFrameVectorized(frame_id, ms_type, retention_time, scan, mobility, tof, indices, + intensity) + + @classmethod + def from_py_tims_frame_vectorized(cls, frame: pims.PyTimsFrameVectorized): + """Create a TimsFrameVectorized from a PyTimsFrameVectorized. + + Args: + frame (pims.PyTimsFrameVectorized): PyTimsFrameVectorized to create the TimsFrameVectorized from. + + Returns: + TimsFrameVectorized: TimsFrameVectorized created from the PyTimsFrameVectorized. + """ + instance = cls.__new__(cls) + instance.__frame_ptr = frame + return instance + + @property + def frame_id(self) -> int: + """Frame ID. + + Returns: + int: Frame ID. + """ + return self.__frame_ptr.frame_id + + @property + def ms_type(self) -> str: + """MS type. + + Returns: + int: MS type. + """ + return self.__frame_ptr.ms_type_as_string + + @property + def retention_time(self) -> float: + """Retention time. + + Returns: + float: Retention time. + """ + return self.__frame_ptr.retention_time + + @property + def scan(self) -> NDArray[np.int32]: + """Scan. + + Returns: + NDArray[np.int32]: Scan. + """ + return self.__frame_ptr.scan + + @property + def mobility(self) -> NDArray[np.float64]: + """Inverse mobility. + + Returns: + NDArray[np.float64]: Inverse mobility. + """ + return self.__frame_ptr.mobility + + @property + def tof(self) -> NDArray[np.int32]: + """Time of flight. + + Returns: + NDArray[np.int32]: Time of flight. + """ + return self.__frame_ptr.tof + + @property + def indices(self) -> NDArray[np.int32]: + """Indices. + + Returns: + NDArray[np.int32]: Indices. + """ + return self.__frame_ptr.indices + + @property + def intensity(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__frame_ptr.values + + @property + def df(self) -> pd.DataFrame: + """ Data as a pandas DataFrame. + + Returns: + pd.DataFrame: Data. + """ + + return pd.DataFrame({ + 'frame': np.repeat(self.frame_id, len(self.scan)), + 'retention_time': np.repeat(self.retention_time, len(self.scan)), + 'scan': self.scan, + 'mobility': self.mobility, + 'tof': self.tof, + 'indices': self.indices, + 'intensity': self.intensity}) + + def __repr__(self): + return (f"TimsFrameVectorized(frame_id={self.__frame_ptr.frame_id}, ms_type={self.__frame_ptr.ms_type}, " + f"num_peaks={len(self.__frame_ptr.indices)})") + + def get_tensor_repr(self, dense=True, zero_indexed=True, re_index=True, scan_max=None, index_max=None): + s = self.scan + f = self.indices + i = self.intensity + + if zero_indexed: + s = s - np.min(s) + f = f - np.min(f) + + if re_index: + f = re_index_indices(f) + + if scan_max is None: + m_s = np.max(s) + 1 + else: + m_s = scan_max + 1 + + if index_max is None: + m_f = np.max(f) + 1 + else: + m_f = index_max + 1 + + sv = sp.reorder(sp.SparseTensor(indices=np.c_[s, f], values=i, dense_shape=(m_s, m_f))) + + if dense: + return sp.to_dense(sv) + else: + return sv diff --git a/imspy/imspy/data/handle.py b/imspy/imspy/data/handle.py new file mode 100644 index 00000000..4aec5145 --- /dev/null +++ b/imspy/imspy/data/handle.py @@ -0,0 +1,157 @@ +from typing import List + +import numpy as np +import pandas as pd +import sqlite3 +from numpy.typing import NDArray + +import imspy_connector as pims +import opentims_bruker_bridge as obb + +from abc import ABC + +<<<<<<<< HEAD:imspy/imspy/handle.py +from imspy.frame import TimsFrame +from imspy.slice import TimsSlice +======== +from pyims.data import TimsFrame, TimsSlice + +>>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/handle.py + + +class TimsDataset(ABC): + def __init__(self, data_path: str): + """TimsDataHandle class. + + Args: + data_path (str): Path to the data. + """ + self.data_path = data_path + self.bp: List[str] = obb.get_so_paths() + self.meta_data = self.__load_meta_data() + self.precursor_frames = self.meta_data[self.meta_data["MsMsType"] == 0].Id.values.astype(np.int32) + self.fragment_frames = self.meta_data[self.meta_data["MsMsType"] > 0].Id.values.astype(np.int32) + self.__handle = None + self.__current_index = 1 + + # Try to load the data with the first binary found + appropriate_found = False + for so_path in self.bp: + try: + self.__handle = pims.PyTimsDataHandle(self.data_path, so_path) + appropriate_found = True + break + except Exception: + continue + assert appropriate_found is True, ("No appropriate bruker binary could be found, please check if your " + "operating system is supported by open-tims-bruker-bridge.") + + @property + def acquisition_mode(self) -> str: + """Get the acquisition mode. + + Returns: + str: Acquisition mode. + """ + return self.__handle.get_acquisition_mode_as_string() + + @property + def acquisition_mode_numerical(self) -> int: + """Get the acquisition mode as a numerical value. + + Returns: + int: Acquisition mode as a numerical value. + """ + return self.__handle.get_acquisition_mode() + + @property + def frame_count(self) -> int: + """Get the number of frames. + + Returns: + int: Number of frames. + """ + return self.__handle.frame_count + + def __load_meta_data(self) -> pd.DataFrame: + """Get the meta data. + + Returns: + pd.DataFrame: Meta data. + """ + return pd.read_sql_query("SELECT * from Frames", sqlite3.connect(self.data_path + "/analysis.tdf")) + + def get_tims_frame(self, frame_id: int) -> TimsFrame: + """Get a TimsFrame. + + Args: + frame_id (int): Frame ID. + + Returns: + TimsFrame: TimsFrame. + """ + return TimsFrame.from_py_tims_frame(self.__handle.get_tims_frame(frame_id)) + + def get_tims_slice(self, frame_ids: NDArray[np.int32]) -> TimsSlice: + """Get a TimsFrame. + + Args: + frame_ids (int): Frame ID. + + Returns: + TimsFrame: TimsFrame. + """ + return TimsSlice.from_py_tims_slice(self.__handle.get_tims_slice(frame_ids)) + + def __iter__(self): + return self + + def __next__(self): + if self.__current_index <= self.frame_count: + frame_ptr = self.__handle.get_tims_frame(self.__current_index) + self.__current_index += 1 + if frame_ptr is not None: + return TimsFrame.from_py_tims_frame(frame_ptr) + else: + raise ValueError(f"Frame pointer is None for valid index: {self.__current_index}") + else: + self.__current_index = 1 # Reset for next iteration + raise StopIteration + + def __getitem__(self, index): + if isinstance(index, slice): + return self.get_tims_slice(np.arange(index.start, index.stop, index.step).astype(np.int32)) + return self.get_tims_frame(index) + + +class TimsDatasetDDA(TimsDataset): + @property + def selected_precursors(self): + """Get precursors selected for fragmentation. + + Returns: + pd.DataFrame: Precursors selected for fragmentation. + """ + return pd.read_sql_query("SELECT * from Precursors", sqlite3.connect(self.data_path + "/analysis.tdf")) + + @property + def pasef_meta_data(self): + """Get PASEF meta data for DDA. + + Returns: + pd.DataFrame: PASEF meta data. + """ + return pd.read_sql_query("SELECT * from PasefFrameMsMsInfo", + sqlite3.connect(self.data_path + "/analysis.tdf")) + + +class TimsDatasetDIA(TimsDataset): + @property + def pasef_meta_data(self): + """Get PASEF meta data for DIA. + + Returns: + pd.DataFrame: PASEF meta data. + """ + return pd.read_sql_query("SELECT * from DiaFrameMsMsWindows", + sqlite3.connect(self.data_path + "/analysis.tdf")) diff --git a/imspy/imspy/data/slice.py b/imspy/imspy/data/slice.py new file mode 100644 index 00000000..056febaa --- /dev/null +++ b/imspy/imspy/data/slice.py @@ -0,0 +1,394 @@ +import numpy as np +import pandas as pd +from typing import List + +from numpy.typing import NDArray +from tensorflow import sparse as sp + +from imspy.utilities import re_index_indices + +<<<<<<<< HEAD:imspy/imspy/slice.py +import imspy_connector as pims +from imspy.frame import TimsFrame, TimsFrameVectorized +from imspy.spectrum import MzSpectrum +======== +import pyims_connector as pims +from pyims.data.frame import TimsFrame, TimsFrameVectorized +from pyims.data.spectrum import MzSpectrum +>>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/slice.py + + +class TimsSlice: + def __init__(self, + frame_id: NDArray[np.int32], + scan: NDArray[np.int32], + tof: NDArray[np.int32], + retention_time: NDArray[np.float64], + mobility: NDArray[np.float64], + mz: NDArray[np.float64], + intensity: NDArray[np.float64]): + """Create a TimsSlice. + + Args: + frame_id (NDArray[np.int32]): Frame ID. + scan (NDArray[np.int32]): Scan. + tof (NDArray[np.int32]): TOF. + retention_time (NDArray[np.float64]): Retention time. + mobility (NDArray[np.float64]): Mobility. + mz (NDArray[np.float64]): m/z. + intensity (NDArray[np.float64]): Intensity. + """ + + assert len(frame_id) == len(scan) == len(tof) == len(retention_time) == len(mobility) == len(mz) == len( + intensity), "All arrays must have the same length." + + self.__slice_ptr = pims.PyTimsSlice( + frame_id, scan, tof, retention_time, mobility, mz, intensity + ) + self.__current_index = 0 + + @classmethod + def from_py_tims_slice(cls, tims_slice: pims.PyTimsSlice): + """Create a TimsSlice from a PyTimsSlice. + + Args: + tims_slice (pims.PyTimsSlice): PyTimsSlice to create the TimsSlice from. + + Returns: + TimsSlice: TimsSlice created from the PyTimsSlice. + """ + instance = cls.__new__(cls) + instance.__slice_ptr = tims_slice + instance.__current_index = 0 + return instance + + @property + def first_frame_id(self) -> int: + """First frame ID. + + Returns: + int: First frame ID. + """ + return self.__slice_ptr.first_frame_id + + @property + def last_frame_id(self) -> int: + """Last frame ID. + + Returns: + int: Last frame ID. + """ + return self.__slice_ptr.last_frame_id + + def __repr__(self): + return f"TimsSlice({self.first_frame_id}, {self.last_frame_id})" + + @property + def precursors(self): + return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_precursors()) + + @property + def fragments(self): + return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_fragments_dda()) + + def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, scan_min: int = 0, scan_max: int = 1000, + mobility_min: float = 0.0, + mobility_max: float = 2.0, + intensity_min: float = 0.0, intensity_max: float = 1e9, num_threads: int = 4) -> 'TimsSlice': + """Filter the slice by m/z, scan and intensity. + + Args: + mz_min (float): Minimum m/z value. + mz_max (float): Maximum m/z value. + scan_min (int, optional): Minimum scan value. Defaults to 0. + scan_max (int, optional): Maximum scan value. Defaults to 1000. + mobility_min (float, optional): Minimum inverse mobility value. Defaults to 0.0. + mobility_max (float, optional): Maximum inverse mobility value. Defaults to 2.0. + intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. + intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. + num_threads (int, optional): Number of threads to use. Defaults to 4. + + Returns: + TimsSlice: Filtered slice. + """ + return TimsSlice.from_py_tims_slice( + self.__slice_ptr.filter_ranged(mz_min, mz_max, scan_min, scan_max, mobility_min, mobility_max, + intensity_min, intensity_max, num_threads)) + + @property + def frames(self) -> List[TimsFrame]: + """Get the frames. + + Returns: + List[TimsFrame]: Frames. + """ + return [TimsFrame.from_py_tims_frame(frame) for frame in self.__slice_ptr.get_frames()] + + def to_resolution(self, resolution: int, num_threads: int = 4) -> 'TimsSlice': + """Convert the slice to a given resolution. + + Args: + resolution (int): Resolution. + num_threads (int, optional): Number of threads to use. Defaults to 4. + + Returns: + TimsSlice: Slice with given resolution. + """ + return TimsSlice.from_py_tims_slice(self.__slice_ptr.to_resolution(resolution, num_threads)) + + def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, + min_intensity: float = 1, num_threads: int = 4) -> List[MzSpectrum]: + """Convert the slice to a list of windows. + + Args: + window_length (float, optional): Window length. Defaults to 10. + overlapping (bool, optional): Whether the windows should overlap. Defaults to True. + min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. + min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. + num_threads (int, optional): Number of threads to use. Defaults to 1. + + Returns: + List[MzSpectrum]: List of windows. + """ + return [MzSpectrum.from_py_mz_spectrum(spec) for spec in self.__slice_ptr.to_windows( + window_length, overlapping, min_num_peaks, min_intensity, num_threads)] + + @property + def df(self) -> pd.DataFrame: + """Get the data as a pandas DataFrame. + + Returns: + pd.DataFrame: Data. + """ + columns = ['frame', 'scan', 'tof', 'retention_time', 'mobility', 'mz', 'intensity'] + return pd.DataFrame({c: v for c, v in zip(columns, self.__slice_ptr.to_arrays())}) + + def __iter__(self): + return self + + def __next__(self): + if self.__current_index < self.__slice_ptr.frame_count: + frame_ptr = self.__slice_ptr.get_frame_at_index(self.__current_index) + self.__current_index += 1 + if frame_ptr is not None: + return TimsFrame.from_py_tims_frame(frame_ptr) + else: + raise ValueError("Frame pointer is None for valid index.") + else: + self.__current_index = 0 # Reset for next iteration + raise StopIteration + + def vectorized(self, resolution: int = 2, num_threads: int = 4) -> 'TimsSliceVectorized': + """Get a vectorized version of the slice. + + Args: + resolution (int, optional): Resolution. Defaults to 2. + num_threads (int, optional): Number of threads to use. Defaults to 4. + + Returns: + TimsSliceVectorized: Vectorized version of the slice. + """ + return TimsSliceVectorized.from_vectorized_py_tims_slice(self.__slice_ptr.vectorized(resolution, num_threads)) + + def get_tims_planes(self, tof_max_value: int = 400_000, num_chunks: int = 7, num_threads: int = 4) -> List[ + 'TimsPlane']: + return [TimsPlane.from_py_tims_plane(plane) for plane in + self.__slice_ptr.to_tims_planes(tof_max_value, num_chunks, num_threads)] + + +class TimsSliceVectorized: + def __init__(self): + self.__slice_ptr = None + self.__current_index = 0 + + @classmethod + def from_vectorized_py_tims_slice(cls, tims_slice: pims.PyTimsSliceVectorized): + """Create a TimsSlice from a PyTimsSlice. + + Args: + tims_slice (pims.PyTimsSlice): PyTimsSlice to create the TimsSlice from. + + Returns: + TimsSlice: TimsSlice created from the PyTimsSlice. + """ + instance = cls.__new__(cls) + instance.__slice_ptr = tims_slice + instance.__current_index = 0 + return instance + + @property + def first_frame_id(self) -> int: + """First frame ID. + + Returns: + int: First frame ID. + """ + return self.__slice_ptr.first_frame_id + + @property + def last_frame_id(self) -> int: + """Last frame ID. + + Returns: + int: Last frame ID. + """ + return self.__slice_ptr.last_frame_id + + @property + def precursors(self): + return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_precursors()) + + @property + def fragments(self): + return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_fragments_dda()) + + @property + def frames(self) -> List[TimsFrameVectorized]: + """Get the frames. + + Returns: + List[TimsFrame]: Frames. + """ + return [TimsFrameVectorized.from_py_tims_frame_vectorized(frame) for frame in + self.__slice_ptr.get_vectorized_frames()] + + @property + def df(self) -> pd.DataFrame: + """Get the data as a pandas DataFrame. + + Returns: + pd.DataFrame: Data. + """ + columns = ['frame', 'scan', 'tof', 'retention_time', 'mobility', 'index', 'intensity'] + return pd.DataFrame({c: v for c, v in zip(columns, self.__slice_ptr.to_arrays())}) + + def __iter__(self): + return self + + def __next__(self): + if self.__current_index < self.__slice_ptr.frame_count: + frame_ptr = self.__slice_ptr.get_frame_at_index(self.__current_index) + self.__current_index += 1 + if frame_ptr is not None: + return TimsFrameVectorized.from_py_tims_frame_vectorized(frame_ptr) + else: + raise ValueError("Frame pointer is None for valid index.") + else: + self.__current_index = 0 + raise StopIteration + + def __repr__(self): + return f"TimsSliceVectorized({self.first_frame_id}, {self.last_frame_id})" + + def get_tensor_repr(self, dense=True, zero_index=True, re_index=True, frame_max=None, scan_max=None, + index_max=None): + + frames, scans, _, _, _, indices, intensities = self.__slice_ptr.to_arrays() + + if zero_index: + scans = scans - np.min(scans) + frames = frames - np.min(frames) + indices = indices - np.min(indices) + + if re_index: + frames = re_index_indices(frames) + + if scan_max is None: + m_s = np.max(scans) + 1 + else: + m_s = scan_max + 1 + + if index_max is None: + m_i = np.max(indices) + 1 + else: + m_i = index_max + 1 + + if frame_max is None: + m_f = np.max(frames) + 1 + else: + m_f = frame_max + 1 + + sv = sp.reorder( + sp.SparseTensor(indices=np.c_[frames, scans, indices], values=intensities, dense_shape=(m_f, m_s, m_i))) + + if dense: + return sp.to_dense(sv) + else: + return sv + + +class TimsPlane: + def __init__(self): + self.__plane_ptr = None + + @classmethod + def from_py_tims_plane(cls, plane: pims.PyTimsPlane): + """Create a TimsPlane from a PyTimsPlane. + + Args: + plane (pims.PyTimsPlane): PyTimsPlane to create the TimsPlane from. + + Returns: + TimsPlane: TimsPlane created from the PyTimsPlane. + """ + instance = cls.__new__(cls) + instance.__plane_ptr = plane + return instance + + @property + def mz_mean(self): + return self.__plane_ptr.mz_mean + + @property + def mz_std(self): + return self.__plane_ptr.mz_std + + @property + def tof_mean(self): + return self.__plane_ptr.tof_mean + + @property + def tof_std(self): + return self.__plane_ptr.tof_std + + @property + def frame_ids(self): + return self.__plane_ptr.frame_ids + + @property + def scans(self): + return self.__plane_ptr.scans + + @property + def intensities(self): + return self.__plane_ptr.intensity + + @property + def retention_times(self): + return self.__plane_ptr.retention_times + + @property + def mobilities(self): + return self.__plane_ptr.mobilities + + @property + def num_points(self): + return len(self.frame_ids) + + @property + def df(self): + return pd.DataFrame({ + 'frame': self.frame_ids, + 'scan': self.scans, + 'retention_time': self.retention_times, + 'mobility': self.mobilities, + 'intensity': self.intensities + }) + + def __repr__(self): + return (f"TimsPlane(mz_mean: " + f"{np.round(self.mz_mean, 4)}, " + f"mz_std: {np.round(self.mz_std, 4)}," + f" tof_mean: {np.round(self.tof_mean, 4)}, " + f"tof_std: {np.round(self.tof_std, 4)}, " + f"num_points: {len(self.frame_ids)})") diff --git a/imspy/imspy/data/spectrum.py b/imspy/imspy/data/spectrum.py new file mode 100644 index 00000000..80baa757 --- /dev/null +++ b/imspy/imspy/data/spectrum.py @@ -0,0 +1,377 @@ +import numpy as np +from typing import List, Tuple + +import pandas as pd +from numpy.typing import NDArray + +import imspy_connector as pims + + +class IndexedMzSpectrum: + def __init__(self, index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): + """IndexedMzSpectrum class. + + Args: + index (NDArray[np.int32]): Index. + mz (NDArray[np.float64]): m/z. + intensity (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the index, mz and intensity arrays are not equal. + """ + assert len(index) == len(mz) == len(intensity), ("The length of the index, mz and intensity arrays must be " + "equal.") + self.__spec_ptr = pims.PyIndexedMzSpectrum(index, mz, intensity) + + @classmethod + def from_py_indexed_mz_spectrum(cls, spec: pims.PyIndexedMzSpectrum): + """Create a IndexedMzSpectrum from a PyIndexedMzSpectrum. + + Args: + spec (pims.PyIndexedMzSpectrum): PyIndexedMzSpectrum to create the IndexedMzSpectrum from. + + Returns: + IndexedMzSpectrum: IndexedMzSpectrum created from the PyIndexedMzSpectrum. + """ + instance = cls.__new__(cls) + instance.__spec_ptr = spec + return instance + + @property + def index(self) -> NDArray[np.int32]: + """Index. + + Returns: + NDArray[np.int32]: Index. + """ + return self.__spec_ptr.index + + @property + def mz(self) -> NDArray[np.float64]: + """m/z. + + Returns: + NDArray[np.float64]: m/z. + """ + return self.__spec_ptr.mz + + @property + def intensity(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__spec_ptr.intensity + + @property + def df(self) -> pd.DataFrame: + """Data. + + Returns: + pd.DataFrame: Data. + """ + + return pd.DataFrame({'index': self.index, 'mz': self.mz, 'intensity': self.intensity}) + + def __repr__(self): + return f"IndexedMzSpectrum(num_peaks={len(self.index)})" + + +class MzSpectrum: + def __init__(self, mz: NDArray[np.float64], intensity: NDArray[np.float64]): + """MzSpectrum class. + + Args: + mz (NDArray[np.float64]): m/z. + intensity (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the mz and intensity arrays are not equal. + """ + assert len(mz) == len(intensity), "The length of the mz and intensity arrays must be equal." + self.__spec_ptr = pims.PyMzSpectrum(mz, intensity) + + @property + def mz(self) -> NDArray[np.float64]: + """m/z. + + Returns: + NDArray[np.float64]: m/z. + """ + return self.__spec_ptr.mz + + @property + def intensity(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__spec_ptr.intensity + + @property + def df(self) -> pd.DataFrame: + """Data. + + Returns: + pd.DataFrame: Data. + """ + + return pd.DataFrame({'mz': self.mz, 'intensity': self.intensity}) + + @classmethod + def from_py_mz_spectrum(cls, spec: pims.PyMzSpectrum): + """Create a MzSpectrum from a PyMzSpectrum. + + Args: + spec (pims.PyMzSpectrum): PyMzSpectrum to create the MzSpectrum from. + + Returns: + MzSpectrum: MzSpectrum created from the PyMzSpectrum. + """ + instance = cls.__new__(cls) + instance.__spec_ptr = spec + return instance + + def __repr__(self): + return f"MzSpectrum(num_peaks={len(self.mz)})" + + def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, + min_intensity: float = 1) -> Tuple[NDArray, List['MzSpectrum']]: + """Convert the spectrum to a list of windows. + + Args: + window_length (float, optional): Window length. Defaults to 10. + overlapping (bool, optional): Whether the windows should overlap. Defaults to True. + min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. + min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. + + Returns: + Tuple[NDArray, List[MzSpectrum]]: List of windows. + """ + + indices, windows = self.__spec_ptr.to_windows(window_length, overlapping, min_num_peaks, min_intensity) + return indices, [MzSpectrum.from_py_mz_spectrum(window) for window in windows] + + def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0, + intensity_max: float = 1e9) -> 'MzSpectrum': + """Filter the spectrum for a given m/z range and intensity range. + + Args: + mz_min (float): Minimum m/z value. + mz_max (float): Maximum m/z value. + intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. + intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. + + Returns: + MzSpectrum: Filtered spectrum. + """ + return MzSpectrum.from_py_mz_spectrum( + self.__spec_ptr.filter_ranged(mz_min, mz_max, intensity_min, intensity_max)) + + def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized': + """Convert the spectrum to a vectorized spectrum. + + Args: + resolution (int, optional): Resolution. Defaults to 2. + + Returns: + MzSpectrumVectorized: Vectorized spectrum. + """ + return MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.vectorized(resolution)) + + +class MzSpectrumVectorized: + def __init__(self, indices: NDArray[np.int32], values: NDArray[np.float64], resolution: int): + """MzSpectrum class. + + Args: + mz (NDArray[np.float64]): m/z. + values (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the mz and intensity arrays are not equal. + """ + assert len(indices) == len(values), "The length of the mz and intensity arrays must be equal." + self.__spec_ptr = pims.PyMzSpectrumVectorized(indices, values, resolution) + + @classmethod + def from_py_mz_spectrum_vectorized(cls, spec: pims.PyMzSpectrumVectorized): + """Create a MzSpectrum from a PyMzSpectrum. + + Args: + spec (pims.PyMzSpectrum): PyMzSpectrum to create the MzSpectrum from. + + Returns: + MzSpectrum: MzSpectrum created from the PyMzSpectrum. + """ + instance = cls.__new__(cls) + instance.__spec_ptr = spec + return instance + + @property + def resolution(self) -> float: + """Resolution. + + Returns: + float: Resolution. + """ + return self.__spec_ptr.resolution + + @property + def indices(self) -> NDArray[np.int32]: + """m/z. + + Returns: + NDArray[np.float64]: m/z. + """ + return self.__spec_ptr.indices + + @property + def values(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__spec_ptr.values + + def __repr__(self): + return f"MzSpectrumVectorized(num_values={len(self.values)})" + + +class TimsSpectrum: + def __init__(self, frame_id: int, scan: int, retention_time: float, mobility: float, ms_type: int, + index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): + """TimsSpectrum class. + + Args: + index (NDArray[np.int32]): Index. + mz (NDArray[np.float64]): m/z. + intensity (NDArray[np.float64]): Intensity. + + Raises: + AssertionError: If the length of the index, mz and intensity arrays are not equal. + """ + assert len(index) == len(mz) == len(intensity), ("The length of the index, mz and intensity arrays must be " + "equal.") + self.__spec_ptr = pims.PyTimsSpectrum(frame_id, scan, retention_time, mobility, ms_type, index, mz, intensity) + + @classmethod + def from_py_tims_spectrum(cls, spec: pims.PyTimsSpectrum): + """Create a TimsSpectrum from a PyTimsSpectrum. + + Args: + spec (pims.PyTimsSpectrum): PyTimsSpectrum to create the TimsSpectrum from. + + Returns: + TimsSpectrum: TimsSpectrum created from the PyTimsSpectrum. + """ + instance = cls.__new__(cls) + instance.__spec_ptr = spec + return instance + + @property + def index(self) -> NDArray[np.int32]: + """Index. + + Returns: + NDArray[np.int32]: Index. + """ + return self.__spec_ptr.index + + @property + def mz(self) -> NDArray[np.float64]: + """m/z. + + Returns: + NDArray[np.float64]: m/z. + """ + return self.__spec_ptr.mz + + @property + def intensity(self) -> NDArray[np.float64]: + """Intensity. + + Returns: + NDArray[np.float64]: Intensity. + """ + return self.__spec_ptr.intensity + + @property + def ms_type(self) -> str: + """MS type. + + Returns: + str: MS type. + """ + return self.__spec_ptr.ms_type + + @property + def mobility(self) -> float: + """Inverse mobility. + + Returns: + float: Inverse mobility. + """ + return self.__spec_ptr.mobility + + @property + def scan(self) -> int: + """Scan. + + Returns: + int: Scan. + """ + return self.__spec_ptr.scan + + @property + def retention_time(self) -> float: + """Retention time. + + Returns: + float: Retention time. + """ + return self.__spec_ptr.retention_time + + @property + def frame_id(self) -> int: + """Frame ID. + + Returns: + int: Frame ID. + """ + return self.__spec_ptr.frame_id + + @property + def mz_spectrum(self) -> MzSpectrum: + """Get the MzSpectrum. + + Returns: + MzSpectrum: Spectrum. + """ + return MzSpectrum.from_py_mz_spectrum(self.__spec_ptr.mz_spectrum) + + @property + def df(self) -> pd.DataFrame: + """Data. + + Returns: + pd.DataFrame: Data. + """ + + return pd.DataFrame({ + 'frame': np.repeat(self.frame_id, len(self.index)), + 'retention_time': np.repeat(self.retention_time, len(self.index)), + 'scan': np.repeat(self.scan, len(self.index)), + 'mobility': np.repeat(self.mobility, len(self.index)), + 'tof': self.index, + 'mz': self.mz, + 'intensity': self.intensity + }) + + def __repr__(self): + return (f"TimsSpectrum(id={self.frame_id}, retention_time={np.round(self.retention_time, 2)}, " + f"scan={self.scan}, mobility={np.round(self.mobility, 2)}, ms_type={self.ms_type}, " + f"num_peaks={len(self.index)})") diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py new file mode 100644 index 00000000..9abf5c5c --- /dev/null +++ b/imspy/imspy/feature.py @@ -0,0 +1,211 @@ +import pandas as pd +import numpy as np +from numpy.typing import ArrayLike + +import json + +from pyims.data import TimsSlice, TimsFrame, MzSpectrum +from pyims.utility import gaussian, exp_gaussian +from pyims.isotopes import IsotopePatternGenerator, create_initial_feature_distribution +from abc import ABC, abstractmethod +from typing import Optional, Dict +class Profile: + + def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + if jsons is not None: + self._jsons = jsons + self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons) + else: + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) + self.model_params = model_params + self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + self._jsons = self._to_jsons() + + def __iter__(self): + self.__size = len(self.positions) + self.__iterator_pos = 0 + return self + + def __next__(self): + if self.__iterator_pos < self.__size: + p = self.positions[self.__iterator_pos] + ra = self.rel_abundancies[self.__iterator_pos] + self.__iterator_pos += 1 + return p,ra + raise StopIteration + + def __getitem__(self, position: int): + return self.access_dictionary[position] + + def _to_jsons(self): + mp = {} + for key,val in self.model_params.items(): + if isinstance(val,np.generic): + mp[key] = val.item() + else: + mp[key] = val + + json_dict = {"positions": self.positions.tolist(), + "rel_abundancies": self.rel_abundancies.tolist(), + "model_params": mp} + return json.dumps(json_dict, allow_nan = False) + + def _from_jsons(self, jsons:str): + json_dict = json.loads(jsons) + positions = json_dict["positions"] + rel_abundancies = json_dict["rel_abundancies"] + access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + return positions,rel_abundancies,json_dict["model_params"], access_dictionary + + @property + def jsons(self): + return self._jsons + +class RTProfile(Profile): + + def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(frames, rel_abundancies, model_params, jsons) + + @property + def frames(self): + return self.positions + +class ScanProfile(Profile): + + def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(scans, rel_abundancies, model_params, jsons) + + @property + def scans(self): + return self.positions + +class ChargeProfile(Profile): + def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + abundant_charges = charges[rel_abundancies>0] + relative_abundancies_over_0 = rel_abundancies[rel_abundancies>0] + super().__init__(abundant_charges, relative_abundancies_over_0, model_params, jsons) + + @property + def charges(self): + return self.positions + +class FeatureGenerator(ABC): + def __init__(self): + pass + + @abstractmethod + def generate_feature(self, mz: float, charge: int): + pass + + +class PrecursorFeatureGenerator(FeatureGenerator): + + def __init__(self): + super(PrecursorFeatureGenerator).__init__() + + def generate_feature(self, mz: float, charge: int, + pattern_generator: IsotopePatternGenerator, + num_rt: int = 64, + num_im: int = 32, + distr_im=gaussian, + distr_rt=exp_gaussian, + rt_lower: float = -9, + rt_upper: float = 18, + im_lower: float = -4, + im_upper: float = 4, + intensity_amplitude: float = 1e3, + min_intensity: int = 5) -> TimsSlice: + + I = create_initial_feature_distribution(num_rt, num_im, rt_lower, rt_upper, + im_lower, im_upper, distr_im, distr_rt) + + frame_list = [] + + spec = pattern_generator.generate_spectrum(mz, charge, min_intensity=5, centroided=True, k=12) + mz, intensity = spec.mz(), spec.intensity() / np.max(spec.intensity()) * 100 + + for i in range(num_rt): + + scan_arr, mz_arr, intensity_arr, tof_arr, inv_mob_arr = [], [], [], [], [] + + for j in range(num_im): + intensity_scaled = intensity * I[i, j] + intensity_scaled = intensity_scaled * intensity_amplitude + int_intensity = np.array([int(x) for x in intensity_scaled]) + + bt = [(x, y) for x, y in zip(mz, int_intensity) if y >= min_intensity] + + mz_tmp = np.array([x for x, y in bt]) + intensity_tmp = np.array([y for x, y in bt]).astype(np.int32) + + rts = np.repeat(i, intensity_tmp.shape[0]).astype(np.float32) + scans = np.repeat(j, intensity_tmp.shape[0]).astype(np.int32) + tof = np.ones_like(rts).astype(np.int32) + inv_mob = np.ones_like(scans) + + scan_arr.append(scans) + mz_arr.append(mz_tmp) + intensity_arr.append(intensity_tmp) + tof_arr.append(tof) + inv_mob_arr.append(inv_mob) + + frame = TimsFrame(None, i, float(i), + np.hstack(scan_arr), np.hstack(mz_arr), np.hstack(intensity_arr), + np.hstack(tof_arr), np.hstack(inv_mob_arr)) + + frame_list.append(frame) + + return TimsSlice(None, frame_list, []) + + +class FeatureBatch: + def __init__(self, feature_table: pd.DataFrame, raw_data: TimsSlice): + """ + + :param feature_table: + :param raw_data: + """ + self.feature_table = feature_table.sort_values(['rt_length'], ascending=False) + self.raw_data = raw_data + self.__feature_counter = 0 + self.current_row = self.feature_table.iloc[0] + r = self.current_row + self.current_feature = self.get_feature(r.mz - 0.1, r.mz + (r.num_peaks / r.charge) + 0.1, + r.im_start, + r.im_stop, + r.rt_start, + r.rt_stop) + + def __repr__(self): + return f'FeatureBatch(features={self.feature_table.shape[0]}, slice={self.raw_data})' + + def get_feature(self, mz_min, mz_max, scan_min, scan_max, rt_min, rt_max, intensity_min=20): + return self.raw_data.filter_ranged(mz_min=mz_min, + mz_max=mz_max, + scan_min=scan_min, + scan_max=scan_max, + intensity_min=intensity_min, + rt_min=rt_min, + rt_max=rt_max) + + def __iter__(self): + return self.current_row, self.current_feature + + def __next__(self): + feature = self.current_row + data = self.current_feature + + if self.__feature_counter < self.feature_table.shape[0]: + self.__feature_counter += 1 + self.current_row = self.feature_table.iloc[self.__feature_counter] + r = self.current_row + self.current_feature = self.get_feature(r.mz - 0.1, + r.mz + (r.num_peaks / r.charge) + 0.1, + r.im_start, + r.im_stop, + r.rt_start, + r.rt_stop) + return feature, data + + raise StopIteration diff --git a/imspy/imspy/handle.py b/imspy/imspy/handle.py index 8c8430b3..4aec5145 100644 --- a/imspy/imspy/handle.py +++ b/imspy/imspy/handle.py @@ -10,8 +10,13 @@ from abc import ABC +<<<<<<<< HEAD:imspy/imspy/handle.py from imspy.frame import TimsFrame from imspy.slice import TimsSlice +======== +from pyims.data import TimsFrame, TimsSlice + +>>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/handle.py class TimsDataset(ABC): diff --git a/imspy/imspy/isotopes.py b/imspy/imspy/isotopes.py new file mode 100644 index 00000000..49b874df --- /dev/null +++ b/imspy/imspy/isotopes.py @@ -0,0 +1,242 @@ +from __future__ import annotations +from typing import Optional, Tuple +import warnings +import numpy as np +from numpy.typing import ArrayLike +from abc import ABC, abstractmethod + +from scipy.signal import argrelextrema +from pyims.data import MzSpectrum +from pyims.utility import gaussian, exp_gaussian, normal_pdf +import numba +import pyopenms + +from pyims.noise import detection_noise + +MASS_PROTON = 1.007276466621 +MASS_NEUTRON = 1.00866491595 + + +@numba.jit(nopython=True) +def factorial(n: int): + if n <= 0: + return 1 + return n * factorial(n - 1) + + +# linear dependency of mass +@numba.jit(nopython=True) +def lam(mass: float, slope: float = 0.000594, intercept: float = -0.03091): + """ + :param intercept: + :param slope: + :param mass: + :return: + """ + return slope * mass + intercept + + +@numba.jit(nopython=True) +def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True): + """ + :param mass: + :param num_steps: + :param normalize: + :return: + """ + factorials = np.zeros_like(peak_nums) + norm = 1 + for i,k in enumerate(peak_nums): + factorials[i] = factorial(k) + weights = np.exp(-lam(mass)) * np.power(lam(mass), peak_nums) / factorials + if normalize: + norm = weights.sum() + return weights/norm + +def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator): + """ + Gets hypothetical intensities of isotopic peaks. + + :param sequence: Peptide sequence in proForma format. + :type sequence: str + :param generator: pyopenms generator for isotopic pattern calculation + :type generator: pyopenms.CoarseIsotopePatternGenerator + :return: List of isotopic peak intensities + :rtype: List + """ + + n = peak_nums.shape[0] + generator.setMaxIsotope(n) + aa_sequence = sequence.strip("_") + peptide= pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")")) + formula = peptide.getFormula() + distribution = generator.run(formula) + intensities = [i.getIntensity() for i in distribution.getContainer()] + return intensities + +@numba.jit(nopython=True) +def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size:float, add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON): + """ + :param mass_neutron: + :param x: + :param mass: + :param charge: + :param sigma: + :param amp: + :param K:4 + :param step_size: + :param add_detection_noise: + :return: + """ + k = np.arange(K) + means = ((mass + mass_neutron * k) / charge).reshape((1,-1)) + weights = weight(mass,k).reshape((1,-1)) + intensities = np.sum(weights*normal_pdf(x.reshape((-1,1)), means, sigma), axis= 1)*step_size + if add_detection_noise: + return detection_noise(intensities*amp) + else: + return intensities * amp + + +@numba.jit(nopython=True) +def numba_generate_pattern(lower_bound: float, + upper_bound: float, + mass: float, + charge: float, + amp: float, + k: int, + sigma: float = 0.008492569002123142, + resolution: int = 3): + """ + :param lower_bound: + :param upper_bound: + :param mass: + :param charge: + :param amp: + :param k: + :param sigma: + :param resolution: + :return: + """ + step_size = min(sigma/10,1/np.power(10,resolution)) + size = int((upper_bound-lower_bound)//step_size+1) + mzs = np.linspace(lower_bound,upper_bound,size) + intensities = iso(mzs,mass,charge,sigma,amp,k,step_size) + + return mzs + MASS_PROTON, intensities.astype(np.int64) + +#@numba.jit(nopython= True) +def numba_ion_sampler(mass: float, charge: int, sigma: ArrayLike, k: int, ion_count: int, intensity_per_ion: int ): + sigma = np.atleast_1d(sigma) + if sigma.size == 1: + sigma = np.repeat(sigma,k) + if sigma.size != k: + raise ValueError("Sigma must be either length 1 or k") + + + us = np.zeros(ion_count) + devs_std = np.zeros(ion_count) + for ion in range(ion_count): + u = np.random.uniform() + dev_std = np.random.normal() + us[ion] = u + devs_std[ion] = dev_std + + weights = weight(mass, np.arange(k)).cumsum() + + def _get_component(u: float, weights_cum_sum: ArrayLike = weights) -> int: + for idx, weight_sum in enumerate(weights_cum_sum): + if u < weight_sum: + return idx + + comps = np.zeros_like(us) + devs = np.zeros_like(us) + for idx, u in enumerate(us): + comp = _get_component(u) + comps[idx] = comp + devs[idx] = sigma[comp] * devs_std[idx] + + mz = ((comps*MASS_NEUTRON) + mass) / charge + MASS_PROTON + devs + i = np.ones_like(mz) * intensity_per_ion + return (mz, i.astype(np.int64)) + +@numba.jit +def create_initial_feature_distribution(num_rt: int, num_im: int, + rt_lower: float = -9, + rt_upper: float = 18, + im_lower: float = -4, + im_upper: float = 4, + distr_im=gaussian, + distr_rt=exp_gaussian) -> np.array: + + I = np.ones((num_rt, num_im)).astype(np.float32) + + for i, x in enumerate(np.linspace(im_lower, im_upper, num_im)): + for j, y in enumerate(np.linspace(rt_lower, rt_upper, num_rt)): + I[j, i] *= (distr_im(x) * distr_rt(y)) + + return I + + +class IsotopePatternGenerator(ABC): + def __init__(self): + pass + @abstractmethod + def generate_pattern(self, mass: float, charge: int) -> Tuple[ArrayLike, ArrayLike]: + pass + + @abstractmethod + def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpectrum: + pass + + +class AveragineGenerator(IsotopePatternGenerator): + def __init__(self): + super().__init__() + self.default_abundancy = 1e4 + + def generate_pattern(self, mass: float, charge: int, k: int = 7, + amp: Optional[float] = None, resolution: float = 3, + min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]: + pass + + def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, + amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + if amp is None: + amp = self.default_abundancy + + lb = mass / charge - .2 + ub = mass / charge + k + .2 + + mz, i = numba_generate_pattern(lower_bound=lb, upper_bound=ub, + mass=mass, charge=charge, amp=amp, k=k, resolution=resolution) + + if centroided: + return MzSpectrum(None, frame_id, scan_id, mz, i)\ + .to_resolution(resolution).filter(lb,ub,min_intensity)\ + .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) + + return MzSpectrum(None, frame_id, scan_id, mz, i).to_resolution(resolution).filter(lb,ub,min_intensity) + + def generate_spectrum_by_sampling(self, + mass: float, + charge: int, + frame_id:int, + scan_id: int, + k:int =7, + sigma: ArrayLike = 0.008492569002123142, + ion_count:int = 1000000, + resolution:float = 3, + intensity_per_ion:int = 1, + centroided = True, + min_intensity = 5) -> MzSpectrum: + + assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}" + mz, i = numba_ion_sampler(mass, charge, sigma, k, ion_count, intensity_per_ion) + + if centroided: + return MzSpectrum(None,frame_id,scan_id,mz,i)\ + .to_resolution(resolution).filter(-1,-1,min_intensity)\ + .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) + return MzSpectrum(None,frame_id,scan_id,mz,i)\ + .to_resolution(resolution).filter(-1,-1,min_intensity) \ No newline at end of file diff --git a/imspy/imspy/noise.py b/imspy/imspy/noise.py new file mode 100644 index 00000000..1702faef --- /dev/null +++ b/imspy/imspy/noise.py @@ -0,0 +1,124 @@ +""" This module contains several noise models, + such as detection noise, shot noise and baseline noise. +""" +import numpy as np +import numba +from typing import Callable, Optional, Tuple +from numpy.typing import ArrayLike +from pyims.data import MzSpectrum +from pyims.utility import normal_pdf + +@numba.jit(nopython=True) +def mu_function_normal_default(intensity: ArrayLike) -> ArrayLike: + return intensity + +@numba.jit(nopython=True) +def sigma_function_normal_default(intensity:ArrayLike) -> ArrayLike: + return np.sqrt(intensity) + +@numba.jit(nopython=True) +def mu_function_poisson_default(intensity:ArrayLike) -> ArrayLike: + offset = 0 + return intensity+ offset + +@numba.jit(nopython=True) +def detection_noise(signal: ArrayLike, + method: str = "poisson", + custom_mu_function: Optional[Callable] = None, + custom_sigma_function: Optional[Callable] = None) -> ArrayLike: + """ + + :param signal: + :param method: + :param custom_mu_function: + :param custom_sigma_function: + """ + if method == "normal": + if custom_sigma_function is None: + sigma_function:Callable = sigma_function_normal_default + else: + sigma_function:Callable = custom_sigma_function + if custom_mu_function is None: + mu_function:Callable = mu_function_normal_default + else: + mu_function:Callable = custom_mu_function + + sigmas = sigma_function(signal) + mus = mu_function(signal) + noised_signal = np.zeros_like(mus) + + for i in range(noised_signal.size): + s = sigmas[i] + m = mus[i] + n = np.random.normal()*s+m + noised_signal[i] = n + + elif method == "poisson": + if custom_sigma_function is not None: + raise ValueError("Sigma function is not used if method is 'poisson'") + if custom_mu_function is None: + mu_function:Callable = mu_function_poisson_default + else: + mu_function:Callable = custom_mu_function + mus = mu_function(signal) + noised_signal = np.zeros_like(mus) + + for i in range(noised_signal.size): + mu = mus[i] + n = np.random.poisson(lam = mu) + noised_signal[i] = n + + else: + raise NotImplementedError("This method is not implemented, choose 'normal' or 'poisson'.") + + return noised_signal + +@numba.jit(nopython=True) +def generate_noise_peak(pos:float, sigma: float, intensity: float, min_intensity:int = 0, resolution:float = 3): + step_size = min(sigma/10,1/np.power(10,resolution)) + lower = int((pos-4*sigma)//step_size) + upper = int((pos+4*sigma)//step_size) + mzs = np.arange(lower,upper)*step_size + intensities = normal_pdf(mzs,pos,sigma,normalize=True)*intensity*step_size + return (mzs[intensities>=min_intensity], intensities[intensities>=min_intensity].astype(np.int64)) + + +def baseline_shot_noise_window(window:MzSpectrum, + window_theoretical_mz_min:float, + window_theoretical_mz_max:float, + expected_noise_peaks: int = 5, + expected_noise_intensity: float = 10, + expected_noise_sigma:float = 0.001, + resolution:float = 3) -> MzSpectrum: + """ + + """ + num_noise_peaks = np.random.poisson(lam=expected_noise_peaks) + noised_window = MzSpectrum(None,-1,-1,[],[]) + for i in range(num_noise_peaks): + location_i = np.random.uniform(window_theoretical_mz_min,window_theoretical_mz_max) + intensity_i = np.random.exponential(expected_noise_intensity) + sigma_i = np.random.exponential(expected_noise_sigma) + noise_mz, noise_intensity = generate_noise_peak(location_i,sigma_i,intensity_i,resolution=resolution) + noise_peak = MzSpectrum(None,-1,-1, noise_mz, noise_intensity) + noised_window = noised_window+noise_peak + return noised_window + + +def baseline_shot_noise(spectrum:MzSpectrum,window_size:float=50,expected_noise_peaks_per_Th:int=10,min_intensity:int = 5, resolution = 3): + """ + + + """ + min_mz = spectrum.mz().min()-0.1 + max_mz = spectrum.mz().max()+0.1 + bins,windows = spectrum.windows(window_size,overlapping=False,min_peaks=0,min_intensity=0) + noised_spectrum = spectrum + for b,w in zip(bins,windows): + lower = b*window_size + upper = (b+1)*window_size + noised_spectrum = noised_spectrum+baseline_shot_noise_window(w,lower,upper,window_size*expected_noise_peaks_per_Th) + return noised_spectrum.to_resolution(resolution).filter(min_mz,max_mz,min_intensity) + +def baseline_noise(): + pass \ No newline at end of file diff --git a/imspy/imspy/proteome.py b/imspy/imspy/proteome.py new file mode 100644 index 00000000..e4490467 --- /dev/null +++ b/imspy/imspy/proteome.py @@ -0,0 +1,322 @@ +from __future__ import annotations +import os +import warnings +import numpy as np +from numpy.typing import ArrayLike +import pandas as pd +import sqlite3 +from pyims.data import MzSpectrum +from pyims.chemistry import get_mono_isotopic_weight, MASS_PROTON + +from pyims.feature import RTProfile, ScanProfile, ChargeProfile +from pyims.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence +from enum import Enum +from abc import ABC, abstractmethod + +from typing import Optional, List, Union + +class ENZYME(Enum): + TRYPSIN = 1 + + +class ORGANISM(Enum): + HOMO_SAPIENS = 1 + + +class Enzyme(ABC): + def __init__(self, name: ENZYME): + self.name = name + + @abstractmethod + def calculate_cleavages(self, sequence: str) -> np.array: + pass + + def digest(self, sequence: str, missed_cleavages: int, min_length) -> list[str]: + pass + + +class Trypsin(Enzyme): + def __init__(self, name: ENZYME = ENZYME.TRYPSIN): + super().__init__(name) + self.name = name + + def calculate_cleavages(self, sequence: str): + """ + Scans sequence for cleavage motifs + and returns list of tuples of peptide intervals. + + :param sequence: protein sequence to digest + :type sequence: str + :return: beginning with + :rtype: List[Tuple[int,int]] + """ + cut_sites = [] + start = 0 + end = 0 + in_unimod_bracket = False + for i,aa in enumerate(sequence[:-1]): + # skip unimod brackets + if aa == "(": + in_unimod_bracket = True + if in_unimod_bracket: + if aa == ")": + in_unimod_bracket = False + continue + + # cut after every 'K' or 'R' that is not followed by 'P' + + next_aa = sequence[i+1] + if ((aa == 'K') or (aa == 'R')) and next_aa != 'P': + end = i+1 + cut_sites.append((start, end)) + start = end + + cut_sites.append((start, len(sequence))) + + return np.array(cut_sites) + + def __repr__(self): + return f'Enzyme(name: {self.name.name})' + + def digest(self, sequence:str, abundancy:float, missed_cleavages:int =0, min_length:int =7): + """ + Digests a protein into peptides. + + :param sequence: Sequence of protein in ProForma standard. + :type sequence: str + :param abundancy: Abundance of protein + :type abundancy: float + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Min length of recorded peptides, defaults to 7 + :type min_length: int, optional + :return: List of dictionaries with `sequence`,`start`, `end` and `abundance` as keys. + :rtype: List[Dict] + """ + assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}' + + peptide_intervals = self.calculate_cleavages(sequence) + + # TODO: implement + if missed_cleavages == 1: + pass + if missed_cleavages == 2: + pass + + dig_list = [] + + for s, e in peptide_intervals: + dig_list.append(sequence[s: e]) + + # pair sequence digests with indices + wi = zip(dig_list, peptide_intervals) + # filter out short sequences and clean index display + wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi)) + + return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi)) + + +class PeptideDigest: + def __init__(self, data: pd.DataFrame, organism: ORGANISM, enzyme: ENZYME): + self.data = data + self.organism = organism + self.enzyme = enzyme + + def __repr__(self): + return f'PeptideMix(Sample: {self.organism.name}, Enzyme: {self.enzyme.name}, number peptides: {self.data.shape[0]})' + + +class ProteinSample: + + def __init__(self, data: pd.DataFrame, name: ORGANISM): + self.data = data + self.name = name + + def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest: + """ + Digest all proteins in the sample with `enzyme` + + :param enzyme: Enzyme for digestion e.g. instance of `Trypsin` + :type enzyme: Enzyme + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Minimum length of peptide to be recorded, defaults to 7 + :type min_length: int, optional + :return: Digested sample + :rtype: PeptideDigest + """ + # digest with enzyme row-wise (one protein at a time) + digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1) + + V = zip(self.data['id'].values, digest.values) + + r_list = [] + + for (gene, peptides) in V: + for (pep_idx, pep) in enumerate(peptides): + # discard sequences with unknown amino acids + if pep['sequence'].find('X') == -1: + pep['gene_id'] = gene + pep['pep_id'] = f"{gene}_{pep_idx}" + pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence']) + pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) + pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']) + pep['sequence'] = '_' + pep['sequence'] + '_' + r_list.append(pep) + + return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) + + def __repr__(self): + return f'ProteinSample(Organism: {self.name.name})' + +class ProteomicsExperimentDatabaseHandle: + def __init__(self,path:str): + if not os.path.exists(path): + print("Connect to Database") + + self.con = sqlite3.connect(path) + self._chunk_size = None + + def push(self, table_name:str, data:PeptideDigest): + if table_name == "PeptideDigest": + assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" + df = data.data.apply(self.make_sql_compatible) + else: + raise ValueError("This Table does not exist and is not supported") + + df.to_sql(table_name, self.con, if_exists="replace") + + def update(self, data_slice: ProteomicsExperimentSampleSlice): + assert isinstance(data_slice, ProteomicsExperimentSampleSlice) + df_separated_peptides = data_slice.peptides.apply(self.make_sql_compatible) + df_ions = data_slice.ions.apply(self.make_sql_compatible) + df_separated_peptides.to_sql("SeparatedPeptides", self.con, if_exists="append") + df_ions.to_sql("Ions", self.con , if_exists="append") + + def load(self, table_name:str, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + return pd.read_sql(query,self.con, index_col="index") + + def load_chunks(self, chunk_size: int, query:Optional[str] = None): + if query is None: + query = "SELECT * FROM PeptideDigest" + self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") + for chunk in self.__chunk_generator: + chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st)) + yield(ProteomicsExperimentSampleSlice(peptides = chunk)) + + def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): + query = ( + "SELECT SeparatedPeptides.pep_id, " + "SeparatedPeptides.sequence, " + "SeparatedPeptides.simulated_frame_profile, " + "SeparatedPeptides.mass_theoretical, " + "SeparatedPeptides.abundancy, " + "SeparatedPeptides.frame_min, " + "SeparatedPeptides.frame_max, " + "Ions.mz, " + "Ions.charge, " + "Ions.relative_abundancy, " + "Ions.scan_min, " + "Ions.scan_max, " + "Ions.simulated_scan_profile, " + "Ions.simulated_mz_spectrum " + "FROM SeparatedPeptides " + "INNER JOIN Ions " + "ON SeparatedPeptides.pep_id = Ions.pep_id " + f"AND SeparatedPeptides.frame_min < {frame_range[1]} " + f"AND SeparatedPeptides.frame_max >= {frame_range[0]} " + ) + df = pd.read_sql(query, self.con) + + # unzip jsons + df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) + df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + if spectra_as_jsons: + return df + else: + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + return df + + @staticmethod + def make_sql_compatible(column): + if column.size < 1: + return + if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): + return column.apply(lambda x: x.jsons) + elif isinstance(column.iloc[0], MzSpectrum): + return column.apply(lambda x: x.to_jsons(only_spectrum = True)) + else: + return column + +class ProteomicsExperimentSampleSlice: + """ + exposed dataframe of database + """ + def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None): + self.peptides = peptides + self.ions = ions + + def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): + accepted_peptide_simulations = [ + "simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + "simulated_charge_profile", + ] + + accepted_ion_simulations = [ + "simulated_scan_apex", + "simulated_k0", + "simulated_scan_profile", + "simulated_mz_spectrum", + ] + + # for profiles store min and max values + get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int]) + get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) + + if simulation_name == "simulated_frame_profile": + + self.peptides["frame_min"] = get_min_position(simulation_data) + self.peptides["frame_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_charge_profile": + ions_dict = { + "sequence":[], + "pep_id":[], + "mz":[], + "charge":[], + "relative_abundancy":[] + } + sequences = self.peptides["sequence"].values + pep_ids = self.peptides["pep_id"].values + masses = self.peptides["mass_theoretical"].values + + for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data): + for (c, r) in charge_profile: + ions_dict["sequence"].append(s) + ions_dict["pep_id"].append(pi) + ions_dict["charge"].append(c) + ions_dict["relative_abundancy"].append(r) + ions_dict["mz"].append(m/c + MASS_PROTON) + self.ions = pd.DataFrame(ions_dict) + + self.peptides["charge_min"] = get_min_position(simulation_data) + self.peptides["charge_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_scan_profile": + + self.ions["scan_min"] = get_min_position(simulation_data) + self.ions["scan_max"] = get_max_position(simulation_data) + + if simulation_name in accepted_peptide_simulations: + self.peptides[simulation_name] = simulation_data + + elif simulation_name in accepted_ion_simulations: + self.ions[simulation_name] = simulation_data + + else: + raise ValueError(f"Simulation name '{simulation_name}' is not defined") \ No newline at end of file diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py new file mode 100644 index 00000000..c6896f31 --- /dev/null +++ b/imspy/imspy/simulation/experiment.py @@ -0,0 +1,204 @@ +import os +import json +from multiprocessing import Pool +import functools +from abc import ABC, abstractmethod +import pandas as pd +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from tqdm import tqdm +from pyims.data import MzSpectrum, TimsFrame +from pyims.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle +from pyims.isotopes import AveragineGenerator +import pyims.simulation.hardware_models as hardware + +class ProteomicsExperiment(ABC): + def __init__(self, path: str): + + # path strings to experiment folder, database and output subfolder + self.path = path + self.output_path = f"{os.path.dirname(path)}/output" + self.database_path = f"{self.path}/experiment_database.db" + + if not os.path.exists(self.path): + os.makedirs(self.path) + + if os.path.exists(self.database_path): + raise FileExistsError("Experiment found in the given path.") + + if not os.path.exists(self.output_path): + os.mkdir(self.output_path) + + # output folder must be empty, otherwise it is + # assumend that it contains old experiments + if len(os.listdir(self.output_path)) > 0: + raise FileExistsError("Experiment found in the given path.") + + # init database and loaded sample + self.database = ProteomicsExperimentDatabaseHandle(self.database_path) + self.loaded_sample = None + + # hardware methods + self._lc_method = None + self._ionization_method = None + self._ion_mobility_separation_method = None + self._mz_separation_method = None + + @property + def lc_method(self): + return self._lc_method + + @lc_method.setter + def lc_method(self, method: hardware.LiquidChromatography): + self._lc_method = method + + @property + def ionization_method(self): + return self._ionization_method + + @ionization_method.setter + def ionization_method(self, method: hardware.IonSource): + self._ionization_method = method + + @property + def ion_mobility_separation_method(self): + return self._ion_mobility_separation_method + + @ion_mobility_separation_method.setter + def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation): + self._ion_mobility_separation_method = method + + @property + def mz_separation_method(self): + return self._mz_separation_method + + @mz_separation_method.setter + def mz_separation_method(self, method: hardware.MzSeparation): + self._mz_separation_method = method + + @abstractmethod + def load_sample(self, sample: PeptideDigest): + self.database.push("PeptideDigest",sample) + + @abstractmethod + def run(self): + pass + + +class LcImsMsMs(ProteomicsExperiment): + def __init__(self, path:str): + super().__init__(path) + + def load_sample(self, sample: PeptideDigest): + return super().load_sample(sample) + + def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process:int = 20): + self._simulate_features(chunk_size) + self._assemble(frames_per_assemble_process, assemble_processes) + + + def _simulate_features(self, chunk_size): + # load bulks of data here as dataframe if necessary + for data_chunk in self.database.load_chunks(chunk_size): + self.lc_method.run(data_chunk) + self.ionization_method.run(data_chunk) + self.ion_mobility_separation_method.run(data_chunk) + self.mz_separation_method.run(data_chunk) + self.database.update(data_chunk) + @staticmethod + def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): + + frame_range_start = frame_range[0] + frame_range_end = frame_range[1] + # generate file_name + file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet" + output_file_path = f"{output_path}/{file_name}" + + frame_range = range(frame_range_start,frame_range_end) + scan_range = range(scan_id_min, scan_id_max+1) + + thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path) + ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) + + + # skip if no peptides in split + if ions_in_split.shape[0] == 0: + return {} + + # spectra are currently stored in json format (from SQL db) + ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + # construct signal data set + signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } + + for _,row in ions_in_split.iterrows(): + + ion_frame_start = max(frame_range_start, row["frame_min"]) + ion_frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range + + ion_scan_start = max(scan_id_min, row["scan_min"]) + ion_scan_end = min(scan_id_max, row["scan_max"]) + + ion_frame_profile = row["simulated_frame_profile"] + ion_scan_profile = row["simulated_scan_profile"] + + ion_charge_abundance = row["abundancy"]*row["relative_abundancy"] + + ion_spectrum = row["simulated_mz_spectrum"] + + # frame start and end inclusive + for f_id in range(ion_frame_start, ion_frame_end+1): + # scan start and end inclusive + for s_id in range(ion_scan_start, ion_scan_end+1): + + abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] + rel_to_default_abundance = abundance/default_abundance + + signal[f_id][s_id].append(ion_spectrum*rel_to_default_abundance) + + output_dict = {"frame_id" : [], + "scan_id" : [], + "mz" : [], + "intensity" : [], + } + for (f_id,frame_dict) in signal.items(): + for (s_id,scan_spectra_list) in frame_dict.items(): + + if len(scan_spectra_list) > 0: + scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1))) + output_dict["mz"].append(scan_spectrum.mz().tolist()) + output_dict["intensity"].append(scan_spectrum.intensity().tolist()) + output_dict["scan_id"].append(s_id) + output_dict["frame_id"].append(f_id) + signal[f_id][s_id] = None + + for key, value in output_dict.items(): + output_dict[key] = pa.array(value) + + pa_table = pa.Table.from_pydict(output_dict) + + pq.write_table(pa_table, output_file_path, compression=None) + + def _assemble(self, frames_per_process:int, num_processes:int): + + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + default_abundance = self.mz_separation_method.model.default_abundance + resolution = self.mz_separation_method.resolution + + split_positions = np.arange(0, self.lc_method.num_frames , step=frames_per_process ).astype(int) + split_start = split_positions[:-1] + split_end = split_positions[1:] + + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path, database_path= self.database_path) + + if num_processes > 1: + with Pool(num_processes) as pool: + list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)),total=len(split_start))) + else: + for start,end in tqdm(zip(split_start,split_end), total = len(split_start)): + assemble_frame_range((start, end)) + + + diff --git a/imspy/imspy/simulation/hardware_models.py b/imspy/imspy/simulation/hardware_models.py new file mode 100644 index 00000000..817d9797 --- /dev/null +++ b/imspy/imspy/simulation/hardware_models.py @@ -0,0 +1,766 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import List, Tuple +from multiprocessing import Pool + +import tensorflow as tf +import numpy as np +from numpy.typing import ArrayLike, NDArray +import pandas as pd +from scipy.stats import exponnorm, norm, binom, gamma + +from pyims.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites +from pyims.proteome import ProteomicsExperimentSampleSlice +from pyims.feature import RTProfile, ScanProfile, ChargeProfile +from pyims.isotopes import AveragineGenerator +from pyims.utility import tokenizer_from_json + +class Device(ABC): + def __init__(self, name:str): + self.name = name + self._temperature = STANDARD_TEMPERATURE + self._pressure = STANDARD_PRESSURE + + @property + def temperature(self): + """ + Get device temperature + + :return: Temperature of device in Kelvin. + :rtype: float + """ + return self._temperature + + @temperature.setter + def temperature(self, T:float): + """ + Set device temperature + + :param T: Temperature in Kelvin. + :type T: float + """ + self._temperature = T + + @property + def pressure(self): + """ + Get device pressure + + :return: Pressure of device in Pa. + :rtype: float + """ + return self._pressure + + @pressure.setter + def pressure(self, p:float): + """ + Set device pressure + :param p: Pressure in Pa. + :type p: float + """ + self._pressure = p + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class Model(ABC): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device): + pass + +class Chromatography(Device): + def __init__(self, name:str="ChromatographyDevice"): + super().__init__(name) + self._apex_model = None + self._profile_model = None + self._irt_to_rt_converter = None + self._frame_length = 1200 + self._gradient_length = 120*60*1000 # 120 minutes in miliseconds + + @property + def frame_length(self): + return self._frame_length + + @frame_length.setter + def frame_length(self, milliseconds: int): + self._frame_length = milliseconds + + @property + def gradient_length(self): + return self._gradient_length/(60*1000) + + @gradient_length.setter + def gradient_length(self, minutes: int): + self._gradient_length = minutes*60*1000 + + @property + def num_frames(self): + return self._gradient_length//self._frame_length + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: ChromatographyApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @property + def irt_to_rt_converter(self): + return self._irt_to_rt_converter + + @irt_to_rt_converter.setter + def irt_to_rt_converter(self, converter:callable): + self._irt_to_rt_converter = converter + + @profile_model.setter + def profile_model(self, model:ChromatographyProfileModel): + self._profile_model = model + + def irt_to_frame_id(self, irt: float): + return self.rt_to_frame_id(self.irt_to_rt(irt)) + + @abstractmethod + def rt_to_frame_id(self, rt: float): + pass + + def irt_to_rt(self, irt): + return self.irt_to_rt_converter(irt) + + def frame_time_interval(self, frame_id:ArrayLike): + frame_id = np.atleast_1d(frame_id) + frame_length_minutes = self.frame_length/(60*1000) + s = (frame_id-1)*frame_length_minutes + e = frame_id*frame_length_minutes + return np.stack([s, e], axis = 1) + + def frame_time_middle(self, frame_id: ArrayLike): + return np.mean(self.frame_time_interval(frame_id),axis=1) + + def frame_time_end(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,1] + + def frame_time_start(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,0] + +class LiquidChromatography(Chromatography): + def __init__(self, name: str = "LiquidChromatographyDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + # retention time apex simulation + retention_time_apex = self._apex_model.simulate(sample, self) + # in irt and rt + sample.add_simulation("simulated_irt_apex", retention_time_apex) + # in frame id + sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex)) + + # profile simulation + + retention_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_frame_profile", retention_profile) + + def rt_to_frame_id(self, rt_minutes: ArrayLike): + rt_minutes = np.asarray(rt_minutes) + # first frame is completed not at 0 but at frame_length + frame_id = (rt_minutes/self.frame_length*1000*60).astype(np.int64)+1 + return frame_id + +class ChromatographyApexModel(Model): + def __init__(self): + self._device = None + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + pass + +class ChromatographyProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + pass + +class EMGChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + profile_list = [] + + for mu in mus: + σ = 0.1 # must be sampled + λ = 10 # must be sampled + K = 1/(σ*λ) + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "lambda":λ, + "mu":μ, + "name":"EMG" + } + + emg = exponnorm(loc=μ, scale=σ, K = K) + # start and end value (in retention times) + s_rt, e_rt = emg.ppf([0.01,0.9]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = emg.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NormalChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6 + profile_list = [] + + for mu,σ in zip(mus,sigmas): + + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "mu":μ, + "name":"Normal" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.02,0.98]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = normal.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NeuralChromatographyApex(ChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048): + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)) + model = tf.keras.models.load_model(model_path) + print('predicting irts...') + return_val = model.predict(dataset) + return return_val + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + + data = sample.peptides + tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized) + print('generating tf dataset...') + tokens_padded = self.sequences_to_tokens(tokens) + + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded)]) + return r[0] + + +class IonSource(Device): + def __init__(self, name:str ="IonizationDevice"): + super().__init__(name) + self._ionization_model = None + + @property + def ionization_model(self): + return self._ionization_model + + @ionization_model.setter + def ionization_model(self, model: IonizationModel): + self._ionization_model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class ElectroSpray(IonSource): + def __init__(self, name:str ="ElectrosprayDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + charge_profiles = self.ionization_model.simulate(sample, self) + sample.add_simulation("simulated_charge_profile", charge_profiles) + +class IonizationModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: + pass + +class RandomIonSource(IonizationModel): + def __init__(self): + super().__init__() + self._charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charge_probabilities(self): + return self._charge_probabilities + + @charge_probabilities.setter + def charge_probabilities(self, probabilities: ArrayLike): + self._charge_probabilities = np.asarray(probabilities) + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + if self.charge_probabilities.shape != self.allowed_charges.shape: + raise ValueError("Number of allowed charges must fit to number of probabilities") + + data = sample.peptides + charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) + rel_intensity = np.ones_like(charge) + charge_profiles = [] + for c,i in zip(charge, rel_intensity): + charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) + return charge_profiles + +class BinomialIonSource(): + def __init__(self): + super().__init__() + self._charged_probability = 0.5 + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charged_probability(self): + return self._charged_probability + + @charged_probability.setter + def charged_probability(self, probability: float): + self._charged_probability = probability + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + sequences = sample.peptides.sequence + vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites) + basic_aa_nums = vec_get_num_protonizable_sites(sequences) + + charge_profiles = [] + for baa_num in basic_aa_nums: + rel_intensities = binom(baa_num,self.charged_probability).pmf(self.allowed_charges) + charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num})) + return charge_profiles + +class IonMobilitySeparation(Device): + def __init__(self, name:str = "IonMobilityDevice"): + super().__init__(name) + # models + self._apex_model = None + self._profile_model = None + + # hardware parameter + self._scan_intervall = 1 + self._scan_time = None + self._scan_id_min = None + self._scan_id_max = None + self._buffer_gas = None + + # converters + self._reduced_im_to_scan_converter = None + self._scan_to_reduced_im_interval_converter = None + + @property + def reduced_im_to_scan_converter(self): + return self._reduced_im_to_scan_converter + + @reduced_im_to_scan_converter.setter + def reduced_im_to_scan_converter(self, converter:callable): + self._reduced_im_to_scan_converter = converter + + @property + def scan_to_reduced_im_interval_converter(self): + return self._scan_to_reduced_im_interval_converter + + @scan_to_reduced_im_interval_converter.setter + def scan_to_reduced_im_interval_converter(self, converter:callable): + self._scan_to_reduced_im_interval_converter = converter + + @property + def buffer_gas(self): + return self._buffer_gas + + @buffer_gas.setter + def buffer_gas(self, gas: BufferGas): + self._buffer_gas = gas + + @property + def scan_intervall(self): + return self._scan_intervall + + @scan_intervall.setter + def scan_intervall(self, number:int): + self._scan_intervall = number + + @property + def scan_id_min(self): + return self._scan_id_min + + @scan_id_min.setter + def scan_id_min(self, number:int): + self._scan_id_min = number + + @property + def scan_id_max(self): + return self._scan_id_max + + @scan_id_max.setter + def scan_id_max(self, number:int): + self._scan_id_max = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: IonMobilityApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: IonMobilityProfileModel): + self._profile_model = model + + def scan_to_reduced_im_interval(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval_converter(scan_id) + + def reduced_im_to_scan(self, ion_mobility): + return self.reduced_im_to_scan_converter(ion_mobility) + + def scan_im_middle(self, scan_id: ArrayLike): + return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) + + def scan_im_lower(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,0] + + def scan_im_upper(self, scan_id:ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,1] + + def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Calculate reduced ion mobility K_0 + (normalized to standard pressure p_0 + and standard temperature T_0), from + ion mobility K. + + K_0 = K * p/p_0 * T_0/T + + [1] J. N. Dodds and E. S. Baker, + “Ion mobility spectrometry: fundamental concepts, + instrumentation, applications, and the road ahead,” + Journal of the American Society for Mass Spectrometry, + vol. 30, no. 11, pp. 2185–2195, 2019, + doi: 10.1007/s13361-019-02288-2. + + :param ion_mobility: Ion mobility K to + normalize to standard conditions + :param p_0: Standard pressure (Pa). + :param T_0: Standard temperature (K). + """ + T = self.temperature + p = self.pressure + return ion_mobility*p/p_0*T_0/T + + def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Inverse of `.im_to_reduced_im()` + """ + T = self.temperature + p = self.pressure + return reduced_ion_mobility*p_0/p*T/T_0 + + def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): + # TODO Citation + """ + Conversion of collision cross-section values (ccs) + to reduced ion mobility according to + Mason-Schamp equation. + + :param ccs: collision cross-section (ccs) + :type ccs: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Reduced ion mobility + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + + K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs) + return K0 + + def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): + """ + Conversion of reduced ion mobility + to collision cross-section values (ccs) + according to Mason-Schamp equation. + + :param reduced_ion_mobility: reduced ion mobility K0 + :type reduced_ion_mobility: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Collision cross-section (ccs) + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + K0 = reduced_ion_mobility + + ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0) + return ccs + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + + +class TrappedIon(IonMobilitySeparation): + + def __init__(self, name:str = "TrappedIonMobilitySeparation"): + super().__init__() + self._scan_id_min = 1 + self._scan_id_max = 918 + + def run(self, sample: ProteomicsExperimentSampleSlice): + # scan apex simulation + one_over_k0, scan_apex = self._apex_model.simulate(sample, self) + # in irt and rt + sample.add_simulation("simulated_scan_apex", scan_apex) + sample.add_simulation("simulated_k0", one_over_k0) + # scan profile simulation + scan_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_scan_profile", scan_profile) + + +class IonMobilityApexModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]: + return super().simulate(sample, device) + +class IonMobilityProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + return super().simulate(sample, device) + +class NeuralIonMobilityApex(IonMobilityApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = True, bs: int = 2048): + + mz = np.expand_dims(mz, 1) + c = tf.one_hot(charges - 1, depth=4) + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)) + + print('predicting mobilities...') + model = tf.keras.models.load_model(model_path) + ccs, _ = model.predict(dataset) + return ccs + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: + data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + tokens_padded = self.sequences_to_tokens(tokens) + mz = data['mz'].values + charges = data["charge"].values + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)]) + ccs = r[0] + K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) + scans = device.reduced_im_to_scan(K0s) + return K0s,scans + +class NormalIonMobilityProfileModel(IonMobilityProfileModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: + mus = 1/sample.ions["simulated_k0"].values #1 over k0 + sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6 + profile_list = [] + for μ,σ in zip(mus,sigmas): + + model_params = { "sigma":σ, + "mu":μ, + "name":"NORMAL" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (k0) + s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99]) + # as scan ids, remember first scans elutes largest ions + e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0) + + profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval + profile_end_im = 1/device.scan_im_upper(profile_scans) + profile_end_cdfs = 1-normal.cdf(profile_end_im) + profile_rel_intensities = np.diff(profile_end_cdfs) + + profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) + return profile_list + + +class MzSeparation(Device): + def __init__(self, name:str = "MassSpectrometer"): + super().__init__(name) + self._model = None + self._resolution = 3 + + @property + def resolution(self): + return self._resolution + + @resolution.setter + def resolution(self, res: int): + self._resolution = res + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: MzSeparationModel): + self._model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class TOF(MzSeparation): + def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + spectra = self.model.simulate(sample, self) + sample.add_simulation("simulated_mz_spectrum", spectra) + +class MzSeparationModel(Model): + def __init__(self): + self._default_abundance = 1e4 + + @property + def default_abundance(self): + return self._default_abundance + + @default_abundance.setter + def default_abundance(self, abundance:int): + self._default_abundance = abundance + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return super().simulate(sample, device) + +class AveragineModel(MzSeparationModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + + avg = AveragineGenerator() + # right join here to keep order of keys in right df + joined_df = pd.merge(sample.peptides[["pep_id","mass_theoretical"]], + sample.ions[["pep_id","charge"]], + how = "right", + on = "pep_id") + masses = joined_df["mass_theoretical"].values + charges = joined_df["charge"].values + spectra = [] + for (m, c) in zip(masses, charges): + spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False) + spectra.append(spectrum) + return spectra \ No newline at end of file diff --git a/imspy/imspy/slice.py b/imspy/imspy/slice.py index da161409..056febaa 100644 --- a/imspy/imspy/slice.py +++ b/imspy/imspy/slice.py @@ -7,9 +7,15 @@ from imspy.utilities import re_index_indices +<<<<<<<< HEAD:imspy/imspy/slice.py import imspy_connector as pims from imspy.frame import TimsFrame, TimsFrameVectorized from imspy.spectrum import MzSpectrum +======== +import pyims_connector as pims +from pyims.data.frame import TimsFrame, TimsFrameVectorized +from pyims.data.spectrum import MzSpectrum +>>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/slice.py class TimsSlice: diff --git a/imspy/imspy/utility.py b/imspy/imspy/utility.py new file mode 100644 index 00000000..001c0246 --- /dev/null +++ b/imspy/imspy/utility.py @@ -0,0 +1,508 @@ +import io +import json +from typing import List, Optional +import tensorflow as tf +import numpy as np +from numpy.typing import ArrayLike + +import numba +import math +import pandas as pd + +from tqdm import tqdm +from pyims.data import MzSpectrum +# TODO bring to pyims +#from proteolizardalgo.hashing import ReferencePattern + + +class TokenSequence: + + def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons:Optional[str] = None): + if jsons is not None: + self.sequence_tokenized = self._from_jsons(jsons) + self._jsons = jsons + else : + self.sequence_tokenized = sequence_tokenized + self._jsons = self._to_jsons() + + def _to_jsons(self): + json_dict = self.sequence_tokenized + return json.dumps(json_dict) + + def _from_jsons(self, jsons:str): + return json.loads(jsons) + + @property + def jsons(self): + return self._jsons + +def proteome_from_fasta(path: str) -> pd.DataFrame: + """ + Read a fasta file and return a dataframe with the protein name and sequence. + :param path: path to the fasta file + :return: a dataframe with the protein name and sequence + """ + d = {} + with open(path) as infile: + gene = '' + header = '' + for i, line in enumerate(infile): + if line.find('>') == -1: + gene += line + elif line.find('>') != -1 and i > 0: + header = line + d[header.replace('\n', '')[4:]] = gene.replace('\n', '') + gene = '' + elif i == 0: + header = line + + row_list = [] + + for key, value in d.items(): + split_index = key.find(' ') + gene_id = key[:split_index].split('|')[0] + rest = key[split_index:] + row = {'id': gene_id, 'meta_data': rest, 'sequence': value} + row_list.append(row) + + return pd.DataFrame(row_list) + + +def peak_width_preserving_mz_transform( + mz: np.array, + M0: float = 500, + resolution: float = 50_000): + """ + Transform values into an index that fixes the width of the peak at half full width. + Arguments: + mz (np.array): An array of mass to charge ratios to transform. + M0 (float): The m/z value at which TOFs resolution is reported. + resolution (float): The resolution of the TOF instrument. + """ + return (np.log(mz) - np.log(M0)) / np.log1p(1 / resolution) + + +def bins_to_mz(mz_bin, win_len): + return np.abs(mz_bin) * win_len + (int(mz_bin < 0) * (0.5 * win_len)) + + +def get_ref_pattern_as_spectra(ref_patterns): + """ + Get the reference patterns as a list of spectra. + :param ref_patterns: the reference patterns + :return: a list of spectra + """ + spec_list = [] + + for index, row in ref_patterns.iterrows(): + + if 150 <= (row['m'] + 1) / row['z'] <= 1700: + last_contrib = row['last_contrib'] + + pattern = row['pattern'][:1000] + + mz_bin = np.arange(1000) + + both = list(zip(mz_bin, pattern)) + + both = list(filter(lambda x: x[1] >= 0.001, both)) + + mz = [x[0] for x in both] + i = [x[1] for x in both] + + first_peak = np.where(np.diff(mz) > 1)[0][0] + max_i = np.argmax(i[:first_peak]) + mono_mz_max = mz[max_i] + + mz = (mz - mono_mz_max) / 100 + row['m'] / row['z'] + + mz = np.round(np.array(mz), 2) + 1.0 + i = np.array([int(x) for x in i]) + + spectum = MzSpectrum(None, -1, -1, mz, i) + spec_list.append((row['m'], row['z'], spectum, last_contrib)) + + return spec_list + + +def get_refspec_list(ref_patterns, win_len=5): + """ + + """ + + d_list = [] + + for spec in tqdm(ref_patterns, desc='creating reference patterns', ncols=100): + m = spec[0] + z = spec[1] + sp = spec[2] + lc = spec[3] + peak_width = 0.04 + mz_mono = np.round((m + 1.001) / z, 2) + peak_width + mz_bin = np.floor(mz_mono / win_len) + ref_spec = ReferencePattern(mz_bin=mz_bin, mz_mono=mz_mono, charge=z, mz_last_contrib=lc, + spectrum=sp, window_length=win_len) + + d_list.append((mz_bin, ref_spec)) + + return d_list + + +def create_reference_dict(D): + # create members + keys = set([t[0] for t in D]) + ref_d = dict([(k, []) for k in keys]) + + for b, p in D: + ref_d[b].append(p) + + tmp_dict = {} + # go over all keys + for key, values in ref_d.items(): + + # create charges + r = np.arange(5) + 1 + + # go over all charge states + for c in r: + + # go over all reference pattern + for ca in values: + + # append exactly one charge state per reference bin + if c == ca.charge: + # if key already exists, append + if key in tmp_dict: + tmp_dict[key].append(ca) + break + + # if key does not exist, create new list + else: + tmp_dict[key] = [ca] + break + + return tmp_dict + +@numba.jit(nopython=True) +def normal_pdf(x: ArrayLike, mass: float, s: float = 0.001, inv_sqrt_2pi: float = 0.3989422804014327, normalize: bool = False): + """ + :param inv_sqrt_2pi: + :param x: + :param mass: + :param s: + :return: + """ + a = (x - mass) / s + if normalize: + return np.exp(-0.5 * np.power(a,2)) + else: + return inv_sqrt_2pi / s * np.exp(-0.5 * np.power(a,2)) + + +@numba.jit +def gaussian(x, μ=0, σ=1): + """ + Gaussian function + :param x: + :param μ: + :param σ: + :return: + """ + A = 1 / np.sqrt(2 * np.pi * np.power(σ, 2)) + B = np.exp(- (np.power(x - μ, 2) / 2 * np.power(σ, 2))) + + return A * B + + +@numba.jit +def exp_distribution(x, λ=1): + """ + Exponential function + :param x: + :param λ: + :return: + """ + if x > 0: + return λ * np.exp(-λ * x) + return 0 + + +@numba.jit +def exp_gaussian(x, μ=-3, σ=1, λ=.25): + """ + laplacian distribution with exponential decay + :param x: + :param μ: + :param σ: + :param λ: + :return: + """ + A = λ / 2 * np.exp(λ / 2 * (2 * μ + λ * np.power(σ, 2) - 2 * x)) + B = math.erfc((μ + λ * np.power(σ, 2) - x) / (np.sqrt(2) * σ)) + return A * B + + +class NormalDistribution: + def __init__(self, μ: float, σ: float): + self.μ = μ + self.σ = σ + + def __call__(self, x): + return gaussian(x, self.μ, self.σ) + + +class ExponentialGaussianDistribution: + def __init__(self, μ: float = -3, σ: float = 1, λ: float = .25): + self.μ = μ + self.σ = σ + self.λ = λ + + def __call__(self, x): + return exp_gaussian(x, self.μ, self.σ, self.λ) + + +def preprocess_max_quant_evidence(exp: pd.DataFrame) -> pd.DataFrame: + """ + select columns from evidence txt, rename to ionmob naming convention and transform to raw data rt in seconds + Args: + exp: a MaxQuant evidence dataframe from evidence.txt table + Returns: cleaned evidence dataframe, columns renamed to ionmob naming convention + """ + + # select columns + exp = exp[['m/z', 'Mass', 'Charge', 'Modified sequence', 'Retention time', + 'Retention length', 'Ion mobility index', 'Ion mobility length', '1/K0', '1/K0 length', + 'Number of isotopic peaks', 'Max intensity m/z 0', 'Intensity', 'Raw file', 'CCS']].rename( + # rename columns to ionmob naming convention + columns={'m/z': 'mz', 'Mass': 'mass', + 'Charge': 'charge', 'Modified sequence': 'sequence', 'Retention time': 'rt', + 'Retention length': 'rt_length', 'Ion mobility index': 'im', 'Ion mobility length': 'im_length', + '1/K0': 'inv_ion_mob', '1/K0 length': 'inv_ion_mob_length', 'CCS': 'ccs', + 'Number of isotopic peaks': 'num_peaks', 'Max intensity m/z 0': 'mz_max_intensity', + 'Intensity': 'intensity', 'Raw file': 'raw'}).dropna() + + # transform retention time from minutes to seconds as stored in tdf raw data + exp['rt'] = exp.apply(lambda r: r['rt'] * 60, axis=1) + exp['rt_length'] = exp.apply(lambda r: r['rt_length'] * 60, axis=1) + exp['rt_start'] = exp.apply(lambda r: r['rt'] - r['rt_length'] / 2, axis=1) + exp['rt_stop'] = exp.apply(lambda r: r['rt'] + r['rt_length'] / 2, axis=1) + + exp['im_start'] = exp.apply(lambda r: int(np.round(r['im'] - r['im_length'] / 2)), axis=1) + exp['im_stop'] = exp.apply(lambda r: int(np.round(r['im'] + r['im_length'] / 2)), axis=1) + + exp['inv_ion_mob_start'] = exp.apply(lambda r: r['inv_ion_mob'] - r['inv_ion_mob_length'] / 2, axis=1) + exp['inv_ion_mob_stop'] = exp.apply(lambda r: r['inv_ion_mob'] + r['inv_ion_mob_length'] / 2, axis=1) + + # remove duplicates + exp = exp.drop_duplicates(['sequence', 'charge', 'rt', 'ccs']) + + return exp + +def max_quant_to_proforma(s:str, old_annotation:bool=False): + """ + Convert MaxQuant amino acid sequences (with PTM) to + proForma format. + + :param s: amino acid sequence + :type s: str + :param old_annotation: Wether old annotation is used in `s`, defaults to False + :type old_annotation: bool, optional + """ + seq = s.strip("_") + + proforma_seq = "" + if old_annotation: + proforma_seq = (seq + .replace("(ox)", "[UNIMOD:35]") + .replace("(ac)", "[UNIMOD:1]") + ) + else: + proforma_seq = (seq + .replace("(Oxidation (M))", "[UNIMOD:35]") + .replace("(Phospho (STY))", "[UNIMOD:21]") + .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]") + ) + return f"_{proforma_seq}_" + +def preprocess_max_quant_sequence(s, old_annotation=False): + """ + :param s: + :param old_annotation: + """ + + seq = s[1:-1] + + is_acc = False + + if old_annotation: + seq = seq.replace('(ox)', '$') + + if seq.find('(ac)') != -1: + is_acc = True + seq = seq.replace('(ac)', '') + + else: + seq = seq.replace('(Oxidation (M))', '$') + seq = seq.replace('(Phospho (STY))', '&') + + if seq.find('(Acetyl (Protein N-term))') != -1: + is_acc = True + seq = seq.replace('(Acetyl (Protein N-term))', '') + + # form list from string + slist = list(seq) + + tmp_list = [] + + for item in slist: + if item == '$': + tmp_list.append('') + + elif item == '&': + tmp_list.append('') + + else: + tmp_list.append(item) + + slist = tmp_list + + r_list = [] + + for i, char in enumerate(slist): + + if char == '': + C = slist[i - 1] + C = C + '-' + r_list = r_list[:-1] + r_list.append(C) + + elif char == 'C': + r_list.append('C-') + + elif char == '': + C = slist[i - 1] + C = C + '-' + r_list = r_list[:-1] + r_list.append(C) + + else: + r_list.append(char) + + if is_acc: + return ['-'] + r_list + [''] + + return [''] + r_list + [''] + +def is_unimod_start(char:str): + """ + Tests if char is start of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is start of unimod bracket + :rtype: bool + """ + if char in ["(","[","{"]: + return True + else: + return False + +def is_unimod_end(char:str): + """ + Tests if char is end of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is end of unimod bracket + :rtype: bool + """ + if char in [")","]","}"]: + return True + else: + return False + +def tokenize_proforma_sequence(sequence: str): + """ + Tokenize a ProForma formatted sequence string. + + :param sequence: Sequence string (ProForma formatted) + :type sequence: str + :return: List of tokens + :rtype: List + """ + sequence = sequence.upper().replace("(","[").replace(")","]") + token_list = [""] + in_unimod_bracket = False + tmp_token = "" + + for aa in sequence: + if is_unimod_start(aa): + in_unimod_bracket = True + if in_unimod_bracket: + if is_unimod_end(aa): + in_unimod_bracket = False + tmp_token += aa + continue + if tmp_token != "": + token_list.append(tmp_token) + tmp_token = "" + tmp_token += aa + + if tmp_token != "": + token_list.append(tmp_token) + + if len(token_list) > 1: + if token_list[1].find("UNIMOD:1") != -1: + token_list[1] = ""+token_list[1] + token_list = token_list[1:] + token_list.append("") + + return token_list + +def get_aa_num_proforma_sequence(sequence:str): + """ + get number of amino acids in sequence + + :param sequence: proforma formatted aa sequence + :type sequence: str + :return: Number of amino acids + :rtype: int + """ + num_aa = 0 + inside_bracket = False + + for aa in sequence: + if is_unimod_start(aa): + inside_bracket = True + if inside_bracket: + if is_unimod_end(aa): + inside_bracket = False + continue + num_aa += 1 + return num_aa + + + +def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str): + """ + save a fit keras tokenizer to json for later use + :param tokenizer: fit keras tokenizer to save + :param path: path to save json to + """ + tokenizer_json = tokenizer.to_json() + with io.open(path, 'w', encoding='utf-8') as f: + f.write(json.dumps(tokenizer_json, ensure_ascii=False)) + + +def tokenizer_from_json(path: str): + """ + load a pre-fit tokenizer from a json file + :param path: path to tokenizer as json file + :return: a keras tokenizer loaded from json + """ + with open(path) as f: + data = json.load(f) + return tf.keras.preprocessing.text.tokenizer_from_json(data) + diff --git a/pyims/examples/simulation/run_example_simulation.py b/pyims/examples/simulation/run_example_simulation.py new file mode 100644 index 00000000..03f01c71 --- /dev/null +++ b/pyims/examples/simulation/run_example_simulation.py @@ -0,0 +1,104 @@ +from pyims.simulation.experiment import LcImsMsMs +from pyims.simulation.hardware_models import (NeuralChromatographyApex, + NormalChromatographyProfileModel, + LiquidChromatography, + ElectroSpray, + TrappedIon, + TOF, + NeuralIonMobilityApex, + NormalIonMobilityProfileModel, + AveragineModel, + BinomialIonSource + ) +from pyims.proteome import ProteinSample, Trypsin, ORGANISM +from pyims.chemistry import BufferGas + +import pandas as pd +import numpy as np + +def irt_to_rt(irt): + return irt + +def scan_im_interval(scan_id): + intercept = 1451.357 + slope = -877.361 + scan_id = np.atleast_1d(scan_id) + lower = ( scan_id - intercept ) / slope + upper = ((scan_id+1) - intercept ) / slope + return np.stack([1/lower, 1/upper], axis=1) + +def im_to_scan(reduced_ion_mobility): + intercept = 1451.357 + slope = -877.361 + # TODO more appropriate function here ? + one_over_k0 = 1/reduced_ion_mobility + return np.round(one_over_k0 * slope + intercept).astype(np.int16) + +def build_experiment(): + t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/") # maybe rather call this class LCIMSMSExperiment + + lc = LiquidChromatography() + lc.frame_length = 1200 #ms + lc.gradient_length = 120 # min + esi = ElectroSpray() + tims = TrappedIon() + tims.scan_time = 110 # us + tof_mz = TOF() + + t.lc_method = lc + t.ionization_method = esi + t.ion_mobility_separation_method = tims + t.mz_separation_method = tof_mz + + N2 = BufferGas("N2") + + tokenizer_path = '/home/tim/Workspaces/ionmob/pretrained-models/tokenizers/tokenizer.json' + + rt_model_weights = "/home/tim/Workspaces/Resources/models/DeepChromatograpy/" + t.lc_method.apex_model = NeuralChromatographyApex(rt_model_weights,tokenizer_path = tokenizer_path) + + t.lc_method.profile_model = NormalChromatographyProfileModel() + t.lc_method.irt_to_rt_converter = irt_to_rt + + + + + + im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor" + t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path = tokenizer_path) + + t.ion_mobility_separation_method.profile_model = NormalIonMobilityProfileModel() + t.ion_mobility_separation_method.buffer_gas = N2 + t.ion_mobility_separation_method.scan_to_reduced_im_interval_converter = scan_im_interval + t.ion_mobility_separation_method.reduced_im_to_scan_converter = im_to_scan + + t.ionization_method.ionization_model = BinomialIonSource() + + + t.mz_separation_method.model = AveragineModel() + + + rng = np.random.default_rng(2023) + # read proteome + proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather') + random_abundances = rng.integers(1e3,1e7,size=proteome.shape[0]) + proteome = proteome.assign(abundancy= random_abundances) + # create sample and sample digest; TODO: add missed cleavages to ENZYMEs + sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS) + sample_digest = sample.digest(Trypsin()) + + + # to reduce computational load in example + sample_digest.data = sample_digest.data.sample(100000, random_state= rng) + + + t.load_sample(sample_digest) + return t + +if __name__ == "__main__": + + + t = build_experiment() + + #cProfile.run("t.run(10000)", filename="profiler_10000_8_process",sort="cumtime") + t.run(100,frames_per_assemble_process=10) \ No newline at end of file diff --git a/pyims/pyims/chemistry.py b/pyims/pyims/chemistry.py new file mode 100644 index 00000000..b5b7d6b3 --- /dev/null +++ b/pyims/pyims/chemistry.py @@ -0,0 +1,213 @@ +import numpy as np +import mendeleev as me + +AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', + 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', + 'Leucine': 'L', 'Threonine': 'T', 'Cysteine': 'C', 'Serine': 'S', 'Glutamine': 'Q', + 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', + 'Aspartic Acid': 'D'} + +AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, + 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, + 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, + 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, + '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '': 0.0, '': 0.0} + +VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], + '': [''], 'P': ['P'], '': ['', '[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], + 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], + 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], + 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} + +MASS_PROTON = 1.007276466583 + +MASS_WATER = 18.010564684 +# IUPAC standard in Kelvin +STANDARD_TEMPERATURE = 273.15 +# IUPAC standard in Pa +STANDARD_PRESSURE = 1e5 +# IUPAC elementary charge +ELEMENTARY_CHARGE = 1.602176634e-19 +# IUPAC BOLTZMANN'S CONSTANT +K_BOLTZMANN = 1.380649e-23 +# constant part of Mason-Schamp equation +# 3/16*sqrt(2π/kb)*e/N0 * +# 1e20 (correction for using A² instead of m²) * +# 1/sqrt(1.660 5402(10)×10−27 kg) (correction for using Da instead of kg) * +# 10000 * (to get cm²/Vs from m²/Vs) +# TODO CITATION +CCS_K0_CONVERSION_CONSTANT = 18509.8632163405 + +def get_monoisotopic_token_weight(token:str): + """ + Gets monoisotopic weight of token + + :param token: Token of aa sequence e.g. "[UNIMOD:1]" + :type token: str + :return: Weight in Dalton. + :rtype: float + """ + splits = token.split("[") + for i in range(1, len(splits)): + splits[i] = "["+splits[i] + + mass = 0 + for split in splits: + mass += AA_MASSES[split] + return mass + + +def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: + mass = 0 + for token in sequence_tokenized: + mass += get_monoisotopic_token_weight(token) + return mass + MASS_WATER + + +def get_mass_over_charge(mass: float, charge: int) -> float: + return (mass / charge) + MASS_PROTON + +def get_num_protonizable_sites(sequence: str) -> int: + """ + Gets number of sites that can be protonized. + This function does not yet account for PTMs. + + :param sequence: Amino acid sequence + :type sequence: str + :return: Number of protonizable sites + :rtype: int + """ + sites = 1 # n-terminus + for s in sequence: + if s in ["H","R","K"]: + sites += 1 + return sites + + +def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): + """ + convert reduced ion mobility (1/k0) to CCS + :param one_over_k0: reduced ion mobility + :param charge: charge state of the ion + :param mz: mass-over-charge of the ion + :param mass_gas: mass of drift gas + :param temp: temperature of the drift gas in C° + :param t_diff: factor to translate from C° to K + """ + reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) + return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) + + +def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): + """ + convert CCS to 1 over reduced ion mobility (1/k0) + :param ccs: collision cross-section + :param charge: charge state of the ion + :param mz: mass-over-charge of the ion + :param mass_gas: mass of drift gas (N2) + :param temp: temperature of the drift gas in C° + :param t_diff: factor to translate from C° to K + """ + reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) + return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) + + +class ChemicalCompound: + + def _calculate_molecular_mass(self): + mass = 0 + for (atom, abundance) in self.element_composition.items(): + mass += me.element(atom).atomic_weight * abundance + return mass + + def __init__(self, formula): + self.element_composition = self.get_composition(formula) + self.mass = self._calculate_molecular_mass() + + def get_composition(self, formula:str): + """ + Parse chemical formula into Dict[str:int] with + atoms as keys and the respective counts as values. + + :param formula: Chemical formula of compound e.g. 'C6H12O6' + :type formula: str + :return: Dictionary Atom: Count + :rtype: Dict[str:int] + """ + if formula.startswith("("): + assert formula.endswith(")") + formula = formula[1:-1] + + tmp_group = "" + tmp_group_count = "" + depth = 0 + comp_list = [] + comp_counts = [] + + # extract components: everything in brackets and atoms + # extract component counts: number behind component or 1 + for (i,e) in enumerate(formula): + if e == "(": + depth += 1 + if depth == 1: + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e == ")": + depth -= 1 + tmp_group += e + continue + if depth > 0: + tmp_group += e + continue + if e.isupper(): + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e.islower(): + tmp_group += e + continue + if e.isnumeric(): + tmp_group_count += e + if tmp_group != "": + comp_list.append(tmp_group) + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + + # assemble dictionary from component lists + atom_dict = {} + for (comp,count) in zip(comp_list,comp_counts): + if not comp.startswith("("): + atom_dict[comp] = count + else: + atom_dicts_depth = self.get_composition(comp) + for atom in atom_dicts_depth: + atom_dicts_depth[atom] *= count + if atom in atom_dict: + atom_dict[atom] += atom_dicts_depth[atom] + else: + atom_dict[atom] = atom_dicts_depth[atom] + atom_dicts_depth = {} + return atom_dict + +class BufferGas(ChemicalCompound): + + def __init__(self, formula: str): + super().__init__(formula) + From 6d2f431e9a3999a0bda1259330d5316d08e6acc6 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 27 Oct 2023 11:39:43 +0200 Subject: [PATCH 2/7] prevent circular import and fix tokenizer error --- imspy/imspy/data/frame.py | 2 +- imspy/imspy/data/handle.py | 10 ++------- imspy/imspy/data/spectrum.py | 22 ++++++++++++++----- imspy/imspy/feature.py | 2 +- imspy/imspy/simulation/hardware_models.py | 8 +++---- .../simulation/run_example_simulation.py | 2 +- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/imspy/imspy/data/frame.py b/imspy/imspy/data/frame.py index 659209b0..6cd4ba17 100644 --- a/imspy/imspy/data/frame.py +++ b/imspy/imspy/data/frame.py @@ -7,7 +7,7 @@ import numpy as np import imspy_connector as pims -from imspy.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum +from imspy.data.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum from imspy.utilities import re_index_indices diff --git a/imspy/imspy/data/handle.py b/imspy/imspy/data/handle.py index 4aec5145..2a5f052c 100644 --- a/imspy/imspy/data/handle.py +++ b/imspy/imspy/data/handle.py @@ -10,14 +10,8 @@ from abc import ABC -<<<<<<<< HEAD:imspy/imspy/handle.py -from imspy.frame import TimsFrame -from imspy.slice import TimsSlice -======== -from pyims.data import TimsFrame, TimsSlice - ->>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/handle.py - +from imspy.data.frame import TimsFrame +from imspy.data.slice import TimsSlice class TimsDataset(ABC): def __init__(self, data_path: str): diff --git a/imspy/imspy/data/spectrum.py b/imspy/imspy/data/spectrum.py index 80baa757..c561b1bb 100644 --- a/imspy/imspy/data/spectrum.py +++ b/imspy/imspy/data/spectrum.py @@ -1,6 +1,6 @@ import numpy as np from typing import List, Tuple - +from __future__ import annotations import pandas as pd from numpy.typing import NDArray @@ -138,7 +138,7 @@ def __repr__(self): return f"MzSpectrum(num_peaks={len(self.mz)})" def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, - min_intensity: float = 1) -> Tuple[NDArray, List['MzSpectrum']]: + min_intensity: float = 1) -> Tuple[NDArray, List[MzSpectrum]]: """Convert the spectrum to a list of windows. Args: @@ -154,8 +154,20 @@ def to_windows(self, window_length: float = 10, overlapping: bool = True, min_nu indices, windows = self.__spec_ptr.to_windows(window_length, overlapping, min_num_peaks, min_intensity) return indices, [MzSpectrum.from_py_mz_spectrum(window) for window in windows] + def to_resolution(self, resolution: int) -> MzSpectrum: + """Bins the spectrum's m/z values to a + given resolution and sums the intensities. + + Args: + resolution (int): Negative decadic logarithm of bin size. + + Returns: + MzSpectrum: A new `MzSpectrum` where m/z values are binned according to the given resolution. + """ + return self.__spec_ptr.to_resolution(resolution) + def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0, - intensity_max: float = 1e9) -> 'MzSpectrum': + intensity_max: float = 1e9) -> MzSpectrum: """Filter the spectrum for a given m/z range and intensity range. Args: @@ -170,7 +182,7 @@ def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: flo return MzSpectrum.from_py_mz_spectrum( self.__spec_ptr.filter_ranged(mz_min, mz_max, intensity_min, intensity_max)) - def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized': + def vectorized(self, resolution: int = 2) -> MzSpectrumVectorized: """Convert the spectrum to a vectorized spectrum. Args: @@ -213,7 +225,7 @@ def from_py_mz_spectrum_vectorized(cls, spec: pims.PyMzSpectrumVectorized): @property def resolution(self) -> float: """Resolution. - + Returns: float: Resolution. """ diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py index 9abf5c5c..2124b5f8 100644 --- a/imspy/imspy/feature.py +++ b/imspy/imspy/feature.py @@ -4,7 +4,7 @@ import json -from pyims.data import TimsSlice, TimsFrame, MzSpectrum +from pyims.data import TimsSlice, TimsFrame from pyims.utility import gaussian, exp_gaussian from pyims.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from abc import ABC, abstractmethod diff --git a/imspy/imspy/simulation/hardware_models.py b/imspy/imspy/simulation/hardware_models.py index 817d9797..2a0b2811 100644 --- a/imspy/imspy/simulation/hardware_models.py +++ b/imspy/imspy/simulation/hardware_models.py @@ -267,7 +267,7 @@ def __init__(self, model_path: str, tokenizer_path: str): def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: print('tokenizing sequences...') - tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens = self.tokenizer.texts_to_sequences(sequences_tokenized) tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded @@ -286,7 +286,7 @@ def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: data = sample.peptides - tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized) + tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized).to_numpy() print('generating tf dataset...') tokens_padded = self.sequences_to_tokens(tokens) @@ -632,7 +632,7 @@ def __init__(self, model_path: str, tokenizer_path: str): def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: print('tokenizing sequences...') - tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens = self.tokenizer.texts_to_sequences(sequences_tokenized) tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded @@ -654,7 +654,7 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np. def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") - tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized).to_numpy() tokens_padded = self.sequences_to_tokens(tokens) mz = data['mz'].values charges = data["charge"].values diff --git a/pyims/examples/simulation/run_example_simulation.py b/pyims/examples/simulation/run_example_simulation.py index 03f01c71..1df9cef0 100644 --- a/pyims/examples/simulation/run_example_simulation.py +++ b/pyims/examples/simulation/run_example_simulation.py @@ -89,7 +89,7 @@ def build_experiment(): # to reduce computational load in example - sample_digest.data = sample_digest.data.sample(100000, random_state= rng) + sample_digest.data = sample_digest.data.sample(100, random_state= rng) t.load_sample(sample_digest) From 0f93ebf18ab104349d5fd1e56c668da5576564bb Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 3 Nov 2023 10:29:47 +0100 Subject: [PATCH 3/7] Changed calling of MzSpectrum --- imspy/imspy/data/spectrum.py | 30 +++++++++++++++++++++++++-- imspy/imspy/isotopes.py | 16 +++++++------- imspy/imspy/noise.py | 4 ++-- imspy/imspy/simulation/experiment.py | 1 + imspy/imspy/utility.py | 4 ++-- imspy/pyproject.toml | 7 +++++++ imspy_connector/src/py_mz_spectrum.rs | 4 ++++ 7 files changed, 51 insertions(+), 15 deletions(-) diff --git a/imspy/imspy/data/spectrum.py b/imspy/imspy/data/spectrum.py index c561b1bb..cb6c7559 100644 --- a/imspy/imspy/data/spectrum.py +++ b/imspy/imspy/data/spectrum.py @@ -1,6 +1,7 @@ +from __future__ import annotations +import json import numpy as np from typing import List, Tuple -from __future__ import annotations import pandas as pd from numpy.typing import NDArray @@ -79,6 +80,18 @@ def __repr__(self): class MzSpectrum: + + @classmethod + def from_jsons(cls, jsons:str) -> MzSpectrum: + json_dict:dict = json.loads(jsons) + mz = json_dict["mz"] + intensity = json_dict["intensity"] + return cls(mz, intensity) + + @classmethod + def from_spectra_list(cls, spectra_list:List[MzSpectrum], resolution: int, centroided: bool)->MzSpectrum: + pass + def __init__(self, mz: NDArray[np.float64], intensity: NDArray[np.float64]): """MzSpectrum class. @@ -192,8 +205,18 @@ def vectorized(self, resolution: int = 2) -> MzSpectrumVectorized: MzSpectrumVectorized: Vectorized spectrum. """ return MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.vectorized(resolution)) + + def to_jsons(self) -> str: + """ + generates json string representation of MzSpectrum + """ + json_dict = {} + json_dict["mz"] = self.mz().tolist() + json_dict["intensity"] = self.intensity().tolist() + return json.dumps(json_dict) + class MzSpectrumVectorized: def __init__(self, indices: NDArray[np.int32], values: NDArray[np.float64], resolution: int): """MzSpectrum class. @@ -253,7 +276,7 @@ def __repr__(self): return f"MzSpectrumVectorized(num_values={len(self.values)})" -class TimsSpectrum: +class TimsSpectrum(AbstractSpectrum): def __init__(self, frame_id: int, scan: int, retention_time: float, mobility: float, ms_type: int, index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): """TimsSpectrum class. @@ -387,3 +410,6 @@ def __repr__(self): return (f"TimsSpectrum(id={self.frame_id}, retention_time={np.round(self.retention_time, 2)}, " f"scan={self.scan}, mobility={np.round(self.mobility, 2)}, ms_type={self.ms_type}, " f"num_peaks={len(self.index)})") + + def to_jsons(self) -> str: + return super().to_jsons() \ No newline at end of file diff --git a/imspy/imspy/isotopes.py b/imspy/imspy/isotopes.py index 49b874df..114ae17e 100644 --- a/imspy/imspy/isotopes.py +++ b/imspy/imspy/isotopes.py @@ -200,7 +200,7 @@ def generate_pattern(self, mass: float, charge: int, k: int = 7, min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]: pass - def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, + def generate_spectrum(self, mass: int, charge: int, k: int = 7, amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: if amp is None: amp = self.default_abundancy @@ -212,17 +212,15 @@ def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, mass=mass, charge=charge, amp=amp, k=k, resolution=resolution) if centroided: - return MzSpectrum(None, frame_id, scan_id, mz, i)\ + return MzSpectrum(mz, i)\ .to_resolution(resolution).filter(lb,ub,min_intensity)\ .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) - return MzSpectrum(None, frame_id, scan_id, mz, i).to_resolution(resolution).filter(lb,ub,min_intensity) + return MzSpectrum(mz, i).to_resolution(resolution).filter(lb,ub,min_intensity) def generate_spectrum_by_sampling(self, mass: float, charge: int, - frame_id:int, - scan_id: int, k:int =7, sigma: ArrayLike = 0.008492569002123142, ion_count:int = 1000000, @@ -235,8 +233,8 @@ def generate_spectrum_by_sampling(self, mz, i = numba_ion_sampler(mass, charge, sigma, k, ion_count, intensity_per_ion) if centroided: - return MzSpectrum(None,frame_id,scan_id,mz,i)\ - .to_resolution(resolution).filter(-1,-1,min_intensity)\ - .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) - return MzSpectrum(None,frame_id,scan_id,mz,i)\ + return MzSpectrum(mz,i)\ + .to_resolution(resolution).filter(-1,-1,min_intensity)\ + .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) #Todo implement centroid method + return MzSpectrum(mz,i)\ .to_resolution(resolution).filter(-1,-1,min_intensity) \ No newline at end of file diff --git a/imspy/imspy/noise.py b/imspy/imspy/noise.py index 1702faef..009aded0 100644 --- a/imspy/imspy/noise.py +++ b/imspy/imspy/noise.py @@ -94,13 +94,13 @@ def baseline_shot_noise_window(window:MzSpectrum, """ num_noise_peaks = np.random.poisson(lam=expected_noise_peaks) - noised_window = MzSpectrum(None,-1,-1,[],[]) + noised_window = MzSpectrum([],[]) for i in range(num_noise_peaks): location_i = np.random.uniform(window_theoretical_mz_min,window_theoretical_mz_max) intensity_i = np.random.exponential(expected_noise_intensity) sigma_i = np.random.exponential(expected_noise_sigma) noise_mz, noise_intensity = generate_noise_peak(location_i,sigma_i,intensity_i,resolution=resolution) - noise_peak = MzSpectrum(None,-1,-1, noise_mz, noise_intensity) + noise_peak = MzSpectrum(noise_mz, noise_intensity) noised_window = noised_window+noise_peak return noised_window diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py index c6896f31..3dc4bd4d 100644 --- a/imspy/imspy/simulation/experiment.py +++ b/imspy/imspy/simulation/experiment.py @@ -106,6 +106,7 @@ def _simulate_features(self, chunk_size): self.ion_mobility_separation_method.run(data_chunk) self.mz_separation_method.run(data_chunk) self.database.update(data_chunk) + @staticmethod def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): diff --git a/imspy/imspy/utility.py b/imspy/imspy/utility.py index 001c0246..ea47e70b 100644 --- a/imspy/imspy/utility.py +++ b/imspy/imspy/utility.py @@ -119,8 +119,8 @@ def get_ref_pattern_as_spectra(ref_patterns): mz = np.round(np.array(mz), 2) + 1.0 i = np.array([int(x) for x in i]) - spectum = MzSpectrum(None, -1, -1, mz, i) - spec_list.append((row['m'], row['z'], spectum, last_contrib)) + spectrum = MzSpectrum(mz, i) + spec_list.append((row['m'], row['z'], spectrum, last_contrib)) return spec_list diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index e33a348a..f18cf6d7 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -12,6 +12,13 @@ numpy = ">=1.21" tensorflow = ">=2.14" tensorflow-probability = ">=0.22.1" imspy-connector = ">=0.2.0" +mendeleev = ">=0.14" +pyopenms = ">=3.1" +scipy = ">=1.11.2" +tqdm = ">=4.66" +pyarrow =">=13.0" +numba = ">=0.57" + [build-system] requires = ["poetry-core"] diff --git a/imspy_connector/src/py_mz_spectrum.rs b/imspy_connector/src/py_mz_spectrum.rs index 8a53c75e..70f09c35 100644 --- a/imspy_connector/src/py_mz_spectrum.rs +++ b/imspy_connector/src/py_mz_spectrum.rs @@ -234,4 +234,8 @@ impl PyTimsSpectrum { pub fn mz_spectrum(&self) -> PyMzSpectrum { PyMzSpectrum { inner: self.inner.spectrum.mz_spectrum.clone() } } + + pub fn to_resolution(&self, resolution: i32) -> Self { + PyTimsSpectrum{ inner: self.inner.to_resolution(resolution)} + } } From 338abe9f70b5753458d9cac715c2f7dcd7b65761 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Sat, 4 Nov 2023 01:44:55 +0100 Subject: [PATCH 4/7] first syn run without runtime error --- imspy/imspy/data/spectrum.py | 38 +++++++++++++++++------ imspy/imspy/feature.py | 2 +- imspy/imspy/isotopes.py | 16 +++++----- imspy/imspy/noise.py | 4 +-- imspy/imspy/proteome.py | 2 +- imspy/imspy/simulation/experiment.py | 6 ++-- imspy/imspy/simulation/hardware_models.py | 4 +-- imspy_connector/src/py_mz_spectrum.rs | 28 ++++++++++++++++- mscore/src/mz_spectrum.rs | 11 +++++++ 9 files changed, 84 insertions(+), 27 deletions(-) diff --git a/imspy/imspy/data/spectrum.py b/imspy/imspy/data/spectrum.py index cb6c7559..2bb3a4c8 100644 --- a/imspy/imspy/data/spectrum.py +++ b/imspy/imspy/data/spectrum.py @@ -86,11 +86,20 @@ def from_jsons(cls, jsons:str) -> MzSpectrum: json_dict:dict = json.loads(jsons) mz = json_dict["mz"] intensity = json_dict["intensity"] - return cls(mz, intensity) + return cls(np.array(mz, dtype=np.float64), np.array(intensity, dtype=np.float64)) @classmethod - def from_spectra_list(cls, spectra_list:List[MzSpectrum], resolution: int, centroided: bool)->MzSpectrum: - pass + def from_mz_spectra_list(cls, spectra_list:List[MzSpectrum], resolution: int)->MzSpectrum: + """Generates a convoluted mass spectrum by adding all spectra in the given list. + + Args: + spectra_list (List[MzSpectrum]): List of mass spectra. + resolution (int): Desired resolution of returned spectrum. + + Returns: + MzSpectrum: Convoluted spectrum. + """ + return cls.from_py_mz_spectrum(pims.PyMzSpectrum.from_mzspectra_list([spectrum.__spec_ptr for spectrum in spectra_list], resolution)) def __init__(self, mz: NDArray[np.float64], intensity: NDArray[np.float64]): """MzSpectrum class. @@ -149,6 +158,18 @@ def from_py_mz_spectrum(cls, spec: pims.PyMzSpectrum): def __repr__(self): return f"MzSpectrum(num_peaks={len(self.mz)})" + + def __mul__(self, scale) -> MzSpectrum: + """Overwrite * operator for scaling of spectrum + + Args: + scale (float): Scale. + + Returns: + MzSpectrum: Scaled spectrum + """ + tmp: pims.PyMzSpectrum = self.__spec_ptr * scale + return self.from_py_mz_spectrum(tmp) def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, min_intensity: float = 1) -> Tuple[NDArray, List[MzSpectrum]]: @@ -177,7 +198,7 @@ def to_resolution(self, resolution: int) -> MzSpectrum: Returns: MzSpectrum: A new `MzSpectrum` where m/z values are binned according to the given resolution. """ - return self.__spec_ptr.to_resolution(resolution) + return MzSpectrum.from_py_mz_spectrum(self.__spec_ptr.to_resolution(resolution)) def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0, intensity_max: float = 1e9) -> MzSpectrum: @@ -211,8 +232,8 @@ def to_jsons(self) -> str: generates json string representation of MzSpectrum """ json_dict = {} - json_dict["mz"] = self.mz().tolist() - json_dict["intensity"] = self.intensity().tolist() + json_dict["mz"] = self.mz.tolist() + json_dict["intensity"] = self.intensity.tolist() return json.dumps(json_dict) @@ -276,7 +297,7 @@ def __repr__(self): return f"MzSpectrumVectorized(num_values={len(self.values)})" -class TimsSpectrum(AbstractSpectrum): +class TimsSpectrum: def __init__(self, frame_id: int, scan: int, retention_time: float, mobility: float, ms_type: int, index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): """TimsSpectrum class. @@ -410,6 +431,3 @@ def __repr__(self): return (f"TimsSpectrum(id={self.frame_id}, retention_time={np.round(self.retention_time, 2)}, " f"scan={self.scan}, mobility={np.round(self.mobility, 2)}, ms_type={self.ms_type}, " f"num_peaks={len(self.index)})") - - def to_jsons(self) -> str: - return super().to_jsons() \ No newline at end of file diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py index 2124b5f8..eb30dddd 100644 --- a/imspy/imspy/feature.py +++ b/imspy/imspy/feature.py @@ -123,7 +123,7 @@ def generate_feature(self, mz: float, charge: int, frame_list = [] spec = pattern_generator.generate_spectrum(mz, charge, min_intensity=5, centroided=True, k=12) - mz, intensity = spec.mz(), spec.intensity() / np.max(spec.intensity()) * 100 + mz, intensity = spec.mz, spec.intensity / np.max(spec.intensity) * 100 for i in range(num_rt): diff --git a/imspy/imspy/isotopes.py b/imspy/imspy/isotopes.py index 114ae17e..458e94fb 100644 --- a/imspy/imspy/isotopes.py +++ b/imspy/imspy/isotopes.py @@ -123,7 +123,7 @@ def numba_generate_pattern(lower_bound: float, mzs = np.linspace(lower_bound,upper_bound,size) intensities = iso(mzs,mass,charge,sigma,amp,k,step_size) - return mzs + MASS_PROTON, intensities.astype(np.int64) + return mzs + MASS_PROTON, intensities #@numba.jit(nopython= True) def numba_ion_sampler(mass: float, charge: int, sigma: ArrayLike, k: int, ion_count: int, intensity_per_ion: int ): @@ -158,7 +158,7 @@ def _get_component(u: float, weights_cum_sum: ArrayLike = weights) -> int: mz = ((comps*MASS_NEUTRON) + mass) / charge + MASS_PROTON + devs i = np.ones_like(mz) * intensity_per_ion - return (mz, i.astype(np.int64)) + return (mz, i) @numba.jit def create_initial_feature_distribution(num_rt: int, num_im: int, @@ -200,8 +200,8 @@ def generate_pattern(self, mass: float, charge: int, k: int = 7, min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]: pass - def generate_spectrum(self, mass: int, charge: int, k: int = 7, - amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + def generate_spectrum(self, mass: int, charge: int, min_intensity: int = 5, k: int = 7, + amp :Optional[float] = None, resolution:float =3, centroided: bool = True) -> MzSpectrum: if amp is None: amp = self.default_abundancy @@ -210,6 +210,7 @@ def generate_spectrum(self, mass: int, charge: int, k: int = 7, mz, i = numba_generate_pattern(lower_bound=lb, upper_bound=ub, mass=mass, charge=charge, amp=amp, k=k, resolution=resolution) + #Todo implement centroid method if centroided: return MzSpectrum(mz, i)\ @@ -233,8 +234,9 @@ def generate_spectrum_by_sampling(self, mz, i = numba_ion_sampler(mass, charge, sigma, k, ion_count, intensity_per_ion) if centroided: - return MzSpectrum(mz,i)\ - .to_resolution(resolution).filter(-1,-1,min_intensity)\ - .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) #Todo implement centroid method + return ( MzSpectrum(mz,i) + .to_resolution(resolution).filter(-1,-1,min_intensity) + .to_centroided(baseline_noise_level=max(min_intensity,1), sigma=1/np.power(10,resolution-1)) + ) return MzSpectrum(mz,i)\ .to_resolution(resolution).filter(-1,-1,min_intensity) \ No newline at end of file diff --git a/imspy/imspy/noise.py b/imspy/imspy/noise.py index 009aded0..daef5073 100644 --- a/imspy/imspy/noise.py +++ b/imspy/imspy/noise.py @@ -110,8 +110,8 @@ def baseline_shot_noise(spectrum:MzSpectrum,window_size:float=50,expected_noise_ """ - min_mz = spectrum.mz().min()-0.1 - max_mz = spectrum.mz().max()+0.1 + min_mz = spectrum.mz.min()-0.1 + max_mz = spectrum.mz.max()+0.1 bins,windows = spectrum.windows(window_size,overlapping=False,min_peaks=0,min_intensity=0) noised_spectrum = spectrum for b,w in zip(bins,windows): diff --git a/imspy/imspy/proteome.py b/imspy/imspy/proteome.py index e4490467..5a601e64 100644 --- a/imspy/imspy/proteome.py +++ b/imspy/imspy/proteome.py @@ -247,7 +247,7 @@ def make_sql_compatible(column): if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): return column.apply(lambda x: x.jsons) elif isinstance(column.iloc[0], MzSpectrum): - return column.apply(lambda x: x.to_jsons(only_spectrum = True)) + return column.apply(lambda x: x.to_jsons()) else: return column diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py index 3dc4bd4d..2b4846d4 100644 --- a/imspy/imspy/simulation/experiment.py +++ b/imspy/imspy/simulation/experiment.py @@ -167,9 +167,9 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan for (s_id,scan_spectra_list) in frame_dict.items(): if len(scan_spectra_list) > 0: - scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1))) - output_dict["mz"].append(scan_spectrum.mz().tolist()) - output_dict["intensity"].append(scan_spectrum.intensity().tolist()) + scan_spectrum = MzSpectrum.from_mz_spectra_list(scan_spectra_list,resolution = resolution) + output_dict["mz"].append(scan_spectrum.mz.tolist()) + output_dict["intensity"].append(scan_spectrum.intensity.tolist()) output_dict["scan_id"].append(s_id) output_dict["frame_id"].append(f_id) signal[f_id][s_id] = None diff --git a/imspy/imspy/simulation/hardware_models.py b/imspy/imspy/simulation/hardware_models.py index 2a0b2811..84a82c3f 100644 --- a/imspy/imspy/simulation/hardware_models.py +++ b/imspy/imspy/simulation/hardware_models.py @@ -761,6 +761,6 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation charges = joined_df["charge"].values spectra = [] for (m, c) in zip(masses, charges): - spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False) + spectrum = avg.generate_spectrum(m,c,amp = self.default_abundance, centroided=False) spectra.append(spectrum) - return spectra \ No newline at end of file + return spectra diff --git a/imspy_connector/src/py_mz_spectrum.rs b/imspy_connector/src/py_mz_spectrum.rs index 70f09c35..0bfb1020 100644 --- a/imspy_connector/src/py_mz_spectrum.rs +++ b/imspy_connector/src/py_mz_spectrum.rs @@ -25,11 +25,13 @@ impl PyMsType { } #[pyclass] +#[derive(Clone)] pub struct PyMzSpectrum { pub inner: MzSpectrum, } #[pymethods] + impl PyMzSpectrum { #[new] pub unsafe fn new(mz: &PyArray1, intensity: &PyArray1) -> PyResult { @@ -37,9 +39,27 @@ impl PyMzSpectrum { inner: MzSpectrum { mz: mz.as_slice()?.to_vec(), intensity: intensity.as_slice()?.to_vec(), - }, + } }) } + #[staticmethod] + pub unsafe fn from_mzspectra_list(list: Vec, resolution: i32) -> PyResult { + if list.is_empty(){ + Ok(PyMzSpectrum { + inner: MzSpectrum { + mz: Vec::new(), + intensity: Vec::new(), + } + }) + } + else { + let mut convoluted: MzSpectrum = MzSpectrum { mz: vec![], intensity: vec![] }; + for spectrum in list { + convoluted = convoluted + spectrum.inner; + } + Ok(PyMzSpectrum { inner: convoluted.to_resolution(resolution) }) + } + } #[getter] pub fn mz(&self, py: Python) -> Py> { @@ -86,6 +106,12 @@ impl PyMzSpectrum { }; Ok(py_filtered) } + pub fn __add__(&self, other: PyMzSpectrum) -> PyResult { + Ok(PyMzSpectrum { inner: (self.inner.clone() + other.inner) }) + } + pub fn __mul__(&self, scale: f64) -> PyResult { + Ok(PyMzSpectrum { inner: (self.inner.clone() * scale) }) + } } #[pyclass] diff --git a/mscore/src/mz_spectrum.rs b/mscore/src/mz_spectrum.rs index 8d0aa497..ef40e72a 100644 --- a/mscore/src/mz_spectrum.rs +++ b/mscore/src/mz_spectrum.rs @@ -293,6 +293,17 @@ impl std::ops::Add for MzSpectrum { } } +impl std::ops::Mul for MzSpectrum { + type Output = Self; + fn mul(self, scale: f64) -> Self::Output{ + let mut scaled_intensities: Vec = vec![0.0; self.intensity.len()]; + for (idx,intensity) in self.intensity.iter().enumerate(){ + scaled_intensities[idx] = scale*intensity; + } + Self{ mz: self.mz.clone(), intensity: scaled_intensities} + + } +} /// Represents a mass spectrum with associated m/z indices, m/z values, and intensities #[derive(Clone)] pub struct IndexedMzSpectrum { From 654ddd1f9f5b54ff4587a1e7f3defbd7a1c88566 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 9 Nov 2023 10:11:46 +0100 Subject: [PATCH 5/7] renamed module to simplify rebase --- .../simulation/run_example_simulation.py | 0 imspy/imspy/chemistry.py | 213 ++++++++++++++++++ imspy/imspy/data/__init__.py | 8 +- imspy/imspy/data/slice.py | 10 +- imspy/imspy/feature.py | 6 +- imspy/imspy/isotopes.py | 6 +- imspy/imspy/noise.py | 4 +- imspy/imspy/proteome.py | 8 +- imspy/imspy/simulation/experiment.py | 8 +- imspy/imspy/simulation/hardware_models.py | 10 +- imspy/imspy/utility.py | 4 +- pyims/pyims/chemistry.py | 213 ------------------ 12 files changed, 242 insertions(+), 248 deletions(-) rename {pyims => imspy}/examples/simulation/run_example_simulation.py (100%) delete mode 100644 pyims/pyims/chemistry.py diff --git a/pyims/examples/simulation/run_example_simulation.py b/imspy/examples/simulation/run_example_simulation.py similarity index 100% rename from pyims/examples/simulation/run_example_simulation.py rename to imspy/examples/simulation/run_example_simulation.py diff --git a/imspy/imspy/chemistry.py b/imspy/imspy/chemistry.py index e69de29b..b5b7d6b3 100644 --- a/imspy/imspy/chemistry.py +++ b/imspy/imspy/chemistry.py @@ -0,0 +1,213 @@ +import numpy as np +import mendeleev as me + +AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', + 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', + 'Leucine': 'L', 'Threonine': 'T', 'Cysteine': 'C', 'Serine': 'S', 'Glutamine': 'Q', + 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', + 'Aspartic Acid': 'D'} + +AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, + 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, + 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, + 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, + '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '': 0.0, '': 0.0} + +VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], + '': [''], 'P': ['P'], '': ['', '[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], + 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], + 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], + 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} + +MASS_PROTON = 1.007276466583 + +MASS_WATER = 18.010564684 +# IUPAC standard in Kelvin +STANDARD_TEMPERATURE = 273.15 +# IUPAC standard in Pa +STANDARD_PRESSURE = 1e5 +# IUPAC elementary charge +ELEMENTARY_CHARGE = 1.602176634e-19 +# IUPAC BOLTZMANN'S CONSTANT +K_BOLTZMANN = 1.380649e-23 +# constant part of Mason-Schamp equation +# 3/16*sqrt(2π/kb)*e/N0 * +# 1e20 (correction for using A² instead of m²) * +# 1/sqrt(1.660 5402(10)×10−27 kg) (correction for using Da instead of kg) * +# 10000 * (to get cm²/Vs from m²/Vs) +# TODO CITATION +CCS_K0_CONVERSION_CONSTANT = 18509.8632163405 + +def get_monoisotopic_token_weight(token:str): + """ + Gets monoisotopic weight of token + + :param token: Token of aa sequence e.g. "[UNIMOD:1]" + :type token: str + :return: Weight in Dalton. + :rtype: float + """ + splits = token.split("[") + for i in range(1, len(splits)): + splits[i] = "["+splits[i] + + mass = 0 + for split in splits: + mass += AA_MASSES[split] + return mass + + +def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: + mass = 0 + for token in sequence_tokenized: + mass += get_monoisotopic_token_weight(token) + return mass + MASS_WATER + + +def get_mass_over_charge(mass: float, charge: int) -> float: + return (mass / charge) + MASS_PROTON + +def get_num_protonizable_sites(sequence: str) -> int: + """ + Gets number of sites that can be protonized. + This function does not yet account for PTMs. + + :param sequence: Amino acid sequence + :type sequence: str + :return: Number of protonizable sites + :rtype: int + """ + sites = 1 # n-terminus + for s in sequence: + if s in ["H","R","K"]: + sites += 1 + return sites + + +def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): + """ + convert reduced ion mobility (1/k0) to CCS + :param one_over_k0: reduced ion mobility + :param charge: charge state of the ion + :param mz: mass-over-charge of the ion + :param mass_gas: mass of drift gas + :param temp: temperature of the drift gas in C° + :param t_diff: factor to translate from C° to K + """ + reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) + return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) + + +def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): + """ + convert CCS to 1 over reduced ion mobility (1/k0) + :param ccs: collision cross-section + :param charge: charge state of the ion + :param mz: mass-over-charge of the ion + :param mass_gas: mass of drift gas (N2) + :param temp: temperature of the drift gas in C° + :param t_diff: factor to translate from C° to K + """ + reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) + return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) + + +class ChemicalCompound: + + def _calculate_molecular_mass(self): + mass = 0 + for (atom, abundance) in self.element_composition.items(): + mass += me.element(atom).atomic_weight * abundance + return mass + + def __init__(self, formula): + self.element_composition = self.get_composition(formula) + self.mass = self._calculate_molecular_mass() + + def get_composition(self, formula:str): + """ + Parse chemical formula into Dict[str:int] with + atoms as keys and the respective counts as values. + + :param formula: Chemical formula of compound e.g. 'C6H12O6' + :type formula: str + :return: Dictionary Atom: Count + :rtype: Dict[str:int] + """ + if formula.startswith("("): + assert formula.endswith(")") + formula = formula[1:-1] + + tmp_group = "" + tmp_group_count = "" + depth = 0 + comp_list = [] + comp_counts = [] + + # extract components: everything in brackets and atoms + # extract component counts: number behind component or 1 + for (i,e) in enumerate(formula): + if e == "(": + depth += 1 + if depth == 1: + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e == ")": + depth -= 1 + tmp_group += e + continue + if depth > 0: + tmp_group += e + continue + if e.isupper(): + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e.islower(): + tmp_group += e + continue + if e.isnumeric(): + tmp_group_count += e + if tmp_group != "": + comp_list.append(tmp_group) + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + + # assemble dictionary from component lists + atom_dict = {} + for (comp,count) in zip(comp_list,comp_counts): + if not comp.startswith("("): + atom_dict[comp] = count + else: + atom_dicts_depth = self.get_composition(comp) + for atom in atom_dicts_depth: + atom_dicts_depth[atom] *= count + if atom in atom_dict: + atom_dict[atom] += atom_dicts_depth[atom] + else: + atom_dict[atom] = atom_dicts_depth[atom] + atom_dicts_depth = {} + return atom_dict + +class BufferGas(ChemicalCompound): + + def __init__(self, formula: str): + super().__init__(formula) + diff --git a/imspy/imspy/data/__init__.py b/imspy/imspy/data/__init__.py index 1809a4f5..75c253f3 100644 --- a/imspy/imspy/data/__init__.py +++ b/imspy/imspy/data/__init__.py @@ -1,4 +1,4 @@ -from pyims.data.frame import TimsFrame -from pyims.data.spectrum import TimsSpectrum, MzSpectrum -from pyims.data.handle import TimsDataset, TimsDatasetDDA, TimsDatasetDIA -from pyims.data.slice import TimsSlice +from imspy.data.frame import TimsFrame +from imspy.data.spectrum import TimsSpectrum, MzSpectrum +from imspy.data.handle import TimsDataset, TimsDatasetDDA, TimsDatasetDIA +from imspy.data.slice import TimsSlice diff --git a/imspy/imspy/data/slice.py b/imspy/imspy/data/slice.py index 056febaa..9e43bd5c 100644 --- a/imspy/imspy/data/slice.py +++ b/imspy/imspy/data/slice.py @@ -7,15 +7,9 @@ from imspy.utilities import re_index_indices -<<<<<<<< HEAD:imspy/imspy/slice.py import imspy_connector as pims -from imspy.frame import TimsFrame, TimsFrameVectorized -from imspy.spectrum import MzSpectrum -======== -import pyims_connector as pims -from pyims.data.frame import TimsFrame, TimsFrameVectorized -from pyims.data.spectrum import MzSpectrum ->>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/slice.py +from imspy.data.frame import TimsFrame, TimsFrameVectorized +from imspy.data.spectrum import MzSpectrum class TimsSlice: diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py index eb30dddd..53ab596f 100644 --- a/imspy/imspy/feature.py +++ b/imspy/imspy/feature.py @@ -4,9 +4,9 @@ import json -from pyims.data import TimsSlice, TimsFrame -from pyims.utility import gaussian, exp_gaussian -from pyims.isotopes import IsotopePatternGenerator, create_initial_feature_distribution +from imspy.data import TimsSlice, TimsFrame +from imspy.utility import gaussian, exp_gaussian +from imspy.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from abc import ABC, abstractmethod from typing import Optional, Dict class Profile: diff --git a/imspy/imspy/isotopes.py b/imspy/imspy/isotopes.py index 458e94fb..da75dae2 100644 --- a/imspy/imspy/isotopes.py +++ b/imspy/imspy/isotopes.py @@ -6,12 +6,12 @@ from abc import ABC, abstractmethod from scipy.signal import argrelextrema -from pyims.data import MzSpectrum -from pyims.utility import gaussian, exp_gaussian, normal_pdf +from imspy.data import MzSpectrum +from imspy.utility import gaussian, exp_gaussian, normal_pdf import numba import pyopenms -from pyims.noise import detection_noise +from imspy.noise import detection_noise MASS_PROTON = 1.007276466621 MASS_NEUTRON = 1.00866491595 diff --git a/imspy/imspy/noise.py b/imspy/imspy/noise.py index daef5073..c2cf163b 100644 --- a/imspy/imspy/noise.py +++ b/imspy/imspy/noise.py @@ -5,8 +5,8 @@ import numba from typing import Callable, Optional, Tuple from numpy.typing import ArrayLike -from pyims.data import MzSpectrum -from pyims.utility import normal_pdf +from imspy.data import MzSpectrum +from imspy.utility import normal_pdf @numba.jit(nopython=True) def mu_function_normal_default(intensity: ArrayLike) -> ArrayLike: diff --git a/imspy/imspy/proteome.py b/imspy/imspy/proteome.py index 5a601e64..85103430 100644 --- a/imspy/imspy/proteome.py +++ b/imspy/imspy/proteome.py @@ -5,11 +5,11 @@ from numpy.typing import ArrayLike import pandas as pd import sqlite3 -from pyims.data import MzSpectrum -from pyims.chemistry import get_mono_isotopic_weight, MASS_PROTON +from imspy.data import MzSpectrum +from imspy.chemistry import get_mono_isotopic_weight, MASS_PROTON -from pyims.feature import RTProfile, ScanProfile, ChargeProfile -from pyims.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence +from imspy.feature import RTProfile, ScanProfile, ChargeProfile +from imspy.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence from enum import Enum from abc import ABC, abstractmethod diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py index 2b4846d4..0f53cd20 100644 --- a/imspy/imspy/simulation/experiment.py +++ b/imspy/imspy/simulation/experiment.py @@ -8,10 +8,10 @@ import pyarrow as pa import pyarrow.parquet as pq from tqdm import tqdm -from pyims.data import MzSpectrum, TimsFrame -from pyims.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle -from pyims.isotopes import AveragineGenerator -import pyims.simulation.hardware_models as hardware +from imspy.data import MzSpectrum, TimsFrame +from imspy.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle +from imspy.isotopes import AveragineGenerator +import imspy.simulation.hardware_models as hardware class ProteomicsExperiment(ABC): def __init__(self, path: str): diff --git a/imspy/imspy/simulation/hardware_models.py b/imspy/imspy/simulation/hardware_models.py index 84a82c3f..a4d24e3e 100644 --- a/imspy/imspy/simulation/hardware_models.py +++ b/imspy/imspy/simulation/hardware_models.py @@ -9,11 +9,11 @@ import pandas as pd from scipy.stats import exponnorm, norm, binom, gamma -from pyims.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites -from pyims.proteome import ProteomicsExperimentSampleSlice -from pyims.feature import RTProfile, ScanProfile, ChargeProfile -from pyims.isotopes import AveragineGenerator -from pyims.utility import tokenizer_from_json +from imspy.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites +from imspy.proteome import ProteomicsExperimentSampleSlice +from imspy.feature import RTProfile, ScanProfile, ChargeProfile +from imspy.isotopes import AveragineGenerator +from imspy.utility import tokenizer_from_json class Device(ABC): def __init__(self, name:str): diff --git a/imspy/imspy/utility.py b/imspy/imspy/utility.py index ea47e70b..dd504b39 100644 --- a/imspy/imspy/utility.py +++ b/imspy/imspy/utility.py @@ -10,8 +10,8 @@ import pandas as pd from tqdm import tqdm -from pyims.data import MzSpectrum -# TODO bring to pyims +from imspy.data import MzSpectrum +# TODO bring to imspy #from proteolizardalgo.hashing import ReferencePattern diff --git a/pyims/pyims/chemistry.py b/pyims/pyims/chemistry.py deleted file mode 100644 index b5b7d6b3..00000000 --- a/pyims/pyims/chemistry.py +++ /dev/null @@ -1,213 +0,0 @@ -import numpy as np -import mendeleev as me - -AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', - 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', - 'Leucine': 'L', 'Threonine': 'T', 'Cysteine': 'C', 'Serine': 'S', 'Glutamine': 'Q', - 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', - 'Aspartic Acid': 'D'} - -AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, - 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, - 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, - 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, - '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '': 0.0, '': 0.0} - -VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], - '': [''], 'P': ['P'], '': ['', '[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], - 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], - 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], - 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} - -MASS_PROTON = 1.007276466583 - -MASS_WATER = 18.010564684 -# IUPAC standard in Kelvin -STANDARD_TEMPERATURE = 273.15 -# IUPAC standard in Pa -STANDARD_PRESSURE = 1e5 -# IUPAC elementary charge -ELEMENTARY_CHARGE = 1.602176634e-19 -# IUPAC BOLTZMANN'S CONSTANT -K_BOLTZMANN = 1.380649e-23 -# constant part of Mason-Schamp equation -# 3/16*sqrt(2π/kb)*e/N0 * -# 1e20 (correction for using A² instead of m²) * -# 1/sqrt(1.660 5402(10)×10−27 kg) (correction for using Da instead of kg) * -# 10000 * (to get cm²/Vs from m²/Vs) -# TODO CITATION -CCS_K0_CONVERSION_CONSTANT = 18509.8632163405 - -def get_monoisotopic_token_weight(token:str): - """ - Gets monoisotopic weight of token - - :param token: Token of aa sequence e.g. "[UNIMOD:1]" - :type token: str - :return: Weight in Dalton. - :rtype: float - """ - splits = token.split("[") - for i in range(1, len(splits)): - splits[i] = "["+splits[i] - - mass = 0 - for split in splits: - mass += AA_MASSES[split] - return mass - - -def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: - mass = 0 - for token in sequence_tokenized: - mass += get_monoisotopic_token_weight(token) - return mass + MASS_WATER - - -def get_mass_over_charge(mass: float, charge: int) -> float: - return (mass / charge) + MASS_PROTON - -def get_num_protonizable_sites(sequence: str) -> int: - """ - Gets number of sites that can be protonized. - This function does not yet account for PTMs. - - :param sequence: Amino acid sequence - :type sequence: str - :return: Number of protonizable sites - :rtype: int - """ - sites = 1 # n-terminus - for s in sequence: - if s in ["H","R","K"]: - sites += 1 - return sites - - -def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): - """ - convert reduced ion mobility (1/k0) to CCS - :param one_over_k0: reduced ion mobility - :param charge: charge state of the ion - :param mz: mass-over-charge of the ion - :param mass_gas: mass of drift gas - :param temp: temperature of the drift gas in C° - :param t_diff: factor to translate from C° to K - """ - reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) - - -def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): - """ - convert CCS to 1 over reduced ion mobility (1/k0) - :param ccs: collision cross-section - :param charge: charge state of the ion - :param mz: mass-over-charge of the ion - :param mass_gas: mass of drift gas (N2) - :param temp: temperature of the drift gas in C° - :param t_diff: factor to translate from C° to K - """ - reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) - - -class ChemicalCompound: - - def _calculate_molecular_mass(self): - mass = 0 - for (atom, abundance) in self.element_composition.items(): - mass += me.element(atom).atomic_weight * abundance - return mass - - def __init__(self, formula): - self.element_composition = self.get_composition(formula) - self.mass = self._calculate_molecular_mass() - - def get_composition(self, formula:str): - """ - Parse chemical formula into Dict[str:int] with - atoms as keys and the respective counts as values. - - :param formula: Chemical formula of compound e.g. 'C6H12O6' - :type formula: str - :return: Dictionary Atom: Count - :rtype: Dict[str:int] - """ - if formula.startswith("("): - assert formula.endswith(")") - formula = formula[1:-1] - - tmp_group = "" - tmp_group_count = "" - depth = 0 - comp_list = [] - comp_counts = [] - - # extract components: everything in brackets and atoms - # extract component counts: number behind component or 1 - for (i,e) in enumerate(formula): - if e == "(": - depth += 1 - if depth == 1: - if tmp_group != "": - comp_list.append(tmp_group) - tmp_group = "" - if tmp_group_count == "": - comp_counts.append(1) - else: - comp_counts.append(int(tmp_group_count)) - tmp_group_count = "" - tmp_group += e - continue - if e == ")": - depth -= 1 - tmp_group += e - continue - if depth > 0: - tmp_group += e - continue - if e.isupper(): - if tmp_group != "": - comp_list.append(tmp_group) - tmp_group = "" - if tmp_group_count == "": - comp_counts.append(1) - else: - comp_counts.append(int(tmp_group_count)) - tmp_group_count = "" - tmp_group += e - continue - if e.islower(): - tmp_group += e - continue - if e.isnumeric(): - tmp_group_count += e - if tmp_group != "": - comp_list.append(tmp_group) - if tmp_group_count == "": - comp_counts.append(1) - else: - comp_counts.append(int(tmp_group_count)) - - # assemble dictionary from component lists - atom_dict = {} - for (comp,count) in zip(comp_list,comp_counts): - if not comp.startswith("("): - atom_dict[comp] = count - else: - atom_dicts_depth = self.get_composition(comp) - for atom in atom_dicts_depth: - atom_dicts_depth[atom] *= count - if atom in atom_dict: - atom_dict[atom] += atom_dicts_depth[atom] - else: - atom_dict[atom] = atom_dicts_depth[atom] - atom_dicts_depth = {} - return atom_dict - -class BufferGas(ChemicalCompound): - - def __init__(self, formula: str): - super().__init__(formula) - From 939b3292ee2b85b42943805ecaea0c7cd0f46126 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 9 Nov 2023 11:58:26 +0100 Subject: [PATCH 6/7] removed duplicated files --- imspy/imspy/frame.py | 391 --------------------------------------- imspy/imspy/handle.py | 157 ---------------- imspy/imspy/slice.py | 394 ---------------------------------------- imspy/imspy/spectrum.py | 377 -------------------------------------- 4 files changed, 1319 deletions(-) delete mode 100644 imspy/imspy/frame.py delete mode 100644 imspy/imspy/handle.py delete mode 100644 imspy/imspy/slice.py delete mode 100644 imspy/imspy/spectrum.py diff --git a/imspy/imspy/frame.py b/imspy/imspy/frame.py deleted file mode 100644 index 659209b0..00000000 --- a/imspy/imspy/frame.py +++ /dev/null @@ -1,391 +0,0 @@ -import pandas as pd - -from typing import List -from numpy.typing import NDArray - -from tensorflow import sparse as sp - -import numpy as np -import imspy_connector as pims -from imspy.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum - -from imspy.utilities import re_index_indices - - -class TimsFrame: - def __init__(self, frame_id: int, ms_type: int, retention_time: float, scan: NDArray[np.int32], - mobility: NDArray[np.float64], tof: NDArray[np.int32], - mz: NDArray[np.float64], intensity: NDArray[np.float64]): - """TimsFrame class. - - Args: - frame_id (int): Frame ID. - ms_type (int): MS type. - retention_time (float): Retention time. - scan (NDArray[np.int32]): Scan. - mobility (NDArray[np.float64]): Inverse mobility. - tof (NDArray[np.int32]): Time of flight. - mz (NDArray[np.float64]): m/z. - intensity (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the scan, mobility, tof, mz and intensity arrays are not equal. - """ - - assert len(scan) == len(mobility) == len(tof) == len(mz) == len(intensity), \ - "The length of the scan, mobility, tof, mz and intensity arrays must be equal." - - self.__frame_ptr = pims.PyTimsFrame(frame_id, ms_type, retention_time, scan, mobility, tof, mz, intensity) - - @classmethod - def from_py_tims_frame(cls, frame: pims.PyTimsFrame): - """Create a TimsFrame from a PyTimsFrame. - - Args: - frame (pims.PyTimsFrame): PyTimsFrame to create the TimsFrame from. - - Returns: - TimsFrame: TimsFrame created from the PyTimsFrame. - """ - instance = cls.__new__(cls) - instance.__frame_ptr = frame - return instance - - @property - def frame_id(self) -> int: - """Frame ID. - - Returns: - int: Frame ID. - """ - return self.__frame_ptr.frame_id - - @property - def ms_type(self) -> str: - """MS type. - - Returns: - int: MS type. - """ - return self.__frame_ptr.ms_type_as_string - - @property - def retention_time(self) -> float: - """Retention time. - - Returns: - float: Retention time. - """ - return self.__frame_ptr.retention_time - - @property - def scan(self) -> NDArray[np.int32]: - """Scan. - - Returns: - NDArray[np.int32]: Scan. - """ - return self.__frame_ptr.scan - - @property - def mobility(self) -> NDArray[np.float64]: - """Inverse mobility. - - Returns: - NDArray[np.float64]: Inverse mobility. - """ - return self.__frame_ptr.mobility - - @property - def tof(self) -> NDArray[np.int32]: - """Time of flight. - - Returns: - NDArray[np.int32]: Time of flight. - """ - return self.__frame_ptr.tof - - @property - def mz(self) -> NDArray[np.float64]: - """m/z. - - Returns: - NDArray[np.float64]: m/z. - """ - return self.__frame_ptr.mz - - @property - def intensity(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__frame_ptr.intensity - - @property - def df(self) -> pd.DataFrame: - """ Data as a pandas DataFrame. - - Returns: - pd.DataFrame: Data. - """ - - return pd.DataFrame({ - 'frame': np.repeat(self.frame_id, len(self.scan)), - 'retention_time': np.repeat(self.retention_time, len(self.scan)), - 'scan': self.scan, - 'mobility': self.mobility, - 'tof': self.tof, - 'mz': self.mz, - 'intensity': self.intensity}) - - def filter(self, - mz_min: float = 0.0, - mz_max: float = 2000.0, - scan_min: int = 0, - scan_max: int = 1000, - mobility_min: float = 0.0, - mobility_max: float = 2.0, - intensity_min: float = 0.0, - intensity_max: float = 1e9, - ) -> 'TimsFrame': - """Filter the frame for a given m/z range, scan range and intensity range. - - Args: - mz_min (float): Minimum m/z value. - mz_max (float): Maximum m/z value. - scan_min (int, optional): Minimum scan value. Defaults to 0. - scan_max (int, optional): Maximum scan value. Defaults to 1000. - mobility_min (float, optional): Minimum inverse mobility value. Defaults to 0.0. - mobility_max (float, optional): Maximum inverse mobility value. Defaults to 2.0. - intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. - intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. - - Returns: - TimsFrame: Filtered frame. - """ - - return TimsFrame.from_py_tims_frame(self.__frame_ptr.filter_ranged(mz_min, mz_max, scan_min, scan_max, mobility_min, mobility_max, - intensity_min, intensity_max)) - - def to_indexed_mz_spectrum(self) -> 'IndexedMzSpectrum': - """Convert the frame to an IndexedMzSpectrum. - - Returns: - IndexedMzSpectrum: IndexedMzSpectrum. - """ - return IndexedMzSpectrum.from_py_indexed_mz_spectrum(self.__frame_ptr.to_indexed_mz_spectrum()) - - def to_resolution(self, resolution: int) -> 'TimsFrame': - """Convert the frame to a given resolution. - - Args: - resolution (int): Resolution. - - Returns: - TimsFrame: Frame with the given resolution. - """ - return TimsFrame.from_py_tims_frame(self.__frame_ptr.to_resolution(resolution)) - - def vectorized(self, resolution: int = 2) -> 'TimsFrameVectorized': - """Convert the frame to a vectorized frame. - - Args: - resolution (int, optional): Resolution. Defaults to 2. - - Returns: - TimsFrameVectorized: Vectorized frame. - """ - return TimsFrameVectorized.from_py_tims_frame_vectorized(self.__frame_ptr.vectorized(resolution)) - - def to_tims_spectra(self) -> List['TimsSpectrum']: - """Convert the frame to a list of TimsSpectrum. - - Returns: - List[TimsSpectrum]: List of TimsSpectrum. - """ - return [TimsSpectrum.from_py_tims_spectrum(spec) for spec in self.__frame_ptr.to_tims_spectra()] - - def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, - min_intensity: float = 1) -> List[MzSpectrum]: - """Convert the frame to a list of windows. - - Args: - window_length (float, optional): Window length. Defaults to 10. - overlapping (bool, optional): Whether the windows should overlap. Defaults to True. - min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. - min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. - - Returns: - List[MzSpectrum]: List of windows. - """ - return [MzSpectrum.from_py_mz_spectrum(spec) for spec in self.__frame_ptr.to_windows( - window_length, overlapping, min_num_peaks, min_intensity)] - - def __repr__(self): - return (f"TimsFrame(frame_id={self.__frame_ptr.frame_id}, ms_type={self.__frame_ptr.ms_type}, " - f"num_peaks={len(self.__frame_ptr.mz)})") - - -class TimsFrameVectorized: - def __init__(self, frame_id: int, ms_type: int, retention_time: float, scan: NDArray[np.int32], - mobility: NDArray[np.float64], tof: NDArray[np.int32], - indices: NDArray[np.int32], intensity: NDArray[np.float64]): - """TimsFrameVectorized class. - - Args: - frame_id (int): Frame ID. - ms_type (int): MS type. - retention_time (float): Retention time. - scan (NDArray[np.int32]): Scan. - mobility (NDArray[np.float64]): Inverse mobility. - tof (NDArray[np.int32]): Time of flight. - indices (NDArray[np.int32]): Indices. - intensity (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the scan, mobility, tof, indices and intensity arrays are not equal. - """ - - assert len(scan) == len(mobility) == len(tof) == len(indices) == len(intensity), \ - "The length of the scan, mobility, tof, indices and intensity arrays must be equal." - - self.__frame_ptr = pims.PyTimsFrameVectorized(frame_id, ms_type, retention_time, scan, mobility, tof, indices, - intensity) - - @classmethod - def from_py_tims_frame_vectorized(cls, frame: pims.PyTimsFrameVectorized): - """Create a TimsFrameVectorized from a PyTimsFrameVectorized. - - Args: - frame (pims.PyTimsFrameVectorized): PyTimsFrameVectorized to create the TimsFrameVectorized from. - - Returns: - TimsFrameVectorized: TimsFrameVectorized created from the PyTimsFrameVectorized. - """ - instance = cls.__new__(cls) - instance.__frame_ptr = frame - return instance - - @property - def frame_id(self) -> int: - """Frame ID. - - Returns: - int: Frame ID. - """ - return self.__frame_ptr.frame_id - - @property - def ms_type(self) -> str: - """MS type. - - Returns: - int: MS type. - """ - return self.__frame_ptr.ms_type_as_string - - @property - def retention_time(self) -> float: - """Retention time. - - Returns: - float: Retention time. - """ - return self.__frame_ptr.retention_time - - @property - def scan(self) -> NDArray[np.int32]: - """Scan. - - Returns: - NDArray[np.int32]: Scan. - """ - return self.__frame_ptr.scan - - @property - def mobility(self) -> NDArray[np.float64]: - """Inverse mobility. - - Returns: - NDArray[np.float64]: Inverse mobility. - """ - return self.__frame_ptr.mobility - - @property - def tof(self) -> NDArray[np.int32]: - """Time of flight. - - Returns: - NDArray[np.int32]: Time of flight. - """ - return self.__frame_ptr.tof - - @property - def indices(self) -> NDArray[np.int32]: - """Indices. - - Returns: - NDArray[np.int32]: Indices. - """ - return self.__frame_ptr.indices - - @property - def intensity(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__frame_ptr.values - - @property - def df(self) -> pd.DataFrame: - """ Data as a pandas DataFrame. - - Returns: - pd.DataFrame: Data. - """ - - return pd.DataFrame({ - 'frame': np.repeat(self.frame_id, len(self.scan)), - 'retention_time': np.repeat(self.retention_time, len(self.scan)), - 'scan': self.scan, - 'mobility': self.mobility, - 'tof': self.tof, - 'indices': self.indices, - 'intensity': self.intensity}) - - def __repr__(self): - return (f"TimsFrameVectorized(frame_id={self.__frame_ptr.frame_id}, ms_type={self.__frame_ptr.ms_type}, " - f"num_peaks={len(self.__frame_ptr.indices)})") - - def get_tensor_repr(self, dense=True, zero_indexed=True, re_index=True, scan_max=None, index_max=None): - s = self.scan - f = self.indices - i = self.intensity - - if zero_indexed: - s = s - np.min(s) - f = f - np.min(f) - - if re_index: - f = re_index_indices(f) - - if scan_max is None: - m_s = np.max(s) + 1 - else: - m_s = scan_max + 1 - - if index_max is None: - m_f = np.max(f) + 1 - else: - m_f = index_max + 1 - - sv = sp.reorder(sp.SparseTensor(indices=np.c_[s, f], values=i, dense_shape=(m_s, m_f))) - - if dense: - return sp.to_dense(sv) - else: - return sv diff --git a/imspy/imspy/handle.py b/imspy/imspy/handle.py deleted file mode 100644 index 4aec5145..00000000 --- a/imspy/imspy/handle.py +++ /dev/null @@ -1,157 +0,0 @@ -from typing import List - -import numpy as np -import pandas as pd -import sqlite3 -from numpy.typing import NDArray - -import imspy_connector as pims -import opentims_bruker_bridge as obb - -from abc import ABC - -<<<<<<<< HEAD:imspy/imspy/handle.py -from imspy.frame import TimsFrame -from imspy.slice import TimsSlice -======== -from pyims.data import TimsFrame, TimsSlice - ->>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/handle.py - - -class TimsDataset(ABC): - def __init__(self, data_path: str): - """TimsDataHandle class. - - Args: - data_path (str): Path to the data. - """ - self.data_path = data_path - self.bp: List[str] = obb.get_so_paths() - self.meta_data = self.__load_meta_data() - self.precursor_frames = self.meta_data[self.meta_data["MsMsType"] == 0].Id.values.astype(np.int32) - self.fragment_frames = self.meta_data[self.meta_data["MsMsType"] > 0].Id.values.astype(np.int32) - self.__handle = None - self.__current_index = 1 - - # Try to load the data with the first binary found - appropriate_found = False - for so_path in self.bp: - try: - self.__handle = pims.PyTimsDataHandle(self.data_path, so_path) - appropriate_found = True - break - except Exception: - continue - assert appropriate_found is True, ("No appropriate bruker binary could be found, please check if your " - "operating system is supported by open-tims-bruker-bridge.") - - @property - def acquisition_mode(self) -> str: - """Get the acquisition mode. - - Returns: - str: Acquisition mode. - """ - return self.__handle.get_acquisition_mode_as_string() - - @property - def acquisition_mode_numerical(self) -> int: - """Get the acquisition mode as a numerical value. - - Returns: - int: Acquisition mode as a numerical value. - """ - return self.__handle.get_acquisition_mode() - - @property - def frame_count(self) -> int: - """Get the number of frames. - - Returns: - int: Number of frames. - """ - return self.__handle.frame_count - - def __load_meta_data(self) -> pd.DataFrame: - """Get the meta data. - - Returns: - pd.DataFrame: Meta data. - """ - return pd.read_sql_query("SELECT * from Frames", sqlite3.connect(self.data_path + "/analysis.tdf")) - - def get_tims_frame(self, frame_id: int) -> TimsFrame: - """Get a TimsFrame. - - Args: - frame_id (int): Frame ID. - - Returns: - TimsFrame: TimsFrame. - """ - return TimsFrame.from_py_tims_frame(self.__handle.get_tims_frame(frame_id)) - - def get_tims_slice(self, frame_ids: NDArray[np.int32]) -> TimsSlice: - """Get a TimsFrame. - - Args: - frame_ids (int): Frame ID. - - Returns: - TimsFrame: TimsFrame. - """ - return TimsSlice.from_py_tims_slice(self.__handle.get_tims_slice(frame_ids)) - - def __iter__(self): - return self - - def __next__(self): - if self.__current_index <= self.frame_count: - frame_ptr = self.__handle.get_tims_frame(self.__current_index) - self.__current_index += 1 - if frame_ptr is not None: - return TimsFrame.from_py_tims_frame(frame_ptr) - else: - raise ValueError(f"Frame pointer is None for valid index: {self.__current_index}") - else: - self.__current_index = 1 # Reset for next iteration - raise StopIteration - - def __getitem__(self, index): - if isinstance(index, slice): - return self.get_tims_slice(np.arange(index.start, index.stop, index.step).astype(np.int32)) - return self.get_tims_frame(index) - - -class TimsDatasetDDA(TimsDataset): - @property - def selected_precursors(self): - """Get precursors selected for fragmentation. - - Returns: - pd.DataFrame: Precursors selected for fragmentation. - """ - return pd.read_sql_query("SELECT * from Precursors", sqlite3.connect(self.data_path + "/analysis.tdf")) - - @property - def pasef_meta_data(self): - """Get PASEF meta data for DDA. - - Returns: - pd.DataFrame: PASEF meta data. - """ - return pd.read_sql_query("SELECT * from PasefFrameMsMsInfo", - sqlite3.connect(self.data_path + "/analysis.tdf")) - - -class TimsDatasetDIA(TimsDataset): - @property - def pasef_meta_data(self): - """Get PASEF meta data for DIA. - - Returns: - pd.DataFrame: PASEF meta data. - """ - return pd.read_sql_query("SELECT * from DiaFrameMsMsWindows", - sqlite3.connect(self.data_path + "/analysis.tdf")) diff --git a/imspy/imspy/slice.py b/imspy/imspy/slice.py deleted file mode 100644 index 056febaa..00000000 --- a/imspy/imspy/slice.py +++ /dev/null @@ -1,394 +0,0 @@ -import numpy as np -import pandas as pd -from typing import List - -from numpy.typing import NDArray -from tensorflow import sparse as sp - -from imspy.utilities import re_index_indices - -<<<<<<<< HEAD:imspy/imspy/slice.py -import imspy_connector as pims -from imspy.frame import TimsFrame, TimsFrameVectorized -from imspy.spectrum import MzSpectrum -======== -import pyims_connector as pims -from pyims.data.frame import TimsFrame, TimsFrameVectorized -from pyims.data.spectrum import MzSpectrum ->>>>>>>> 1cd02c1 (transfer simulation framework from proteolizard):imspy/imspy/data/slice.py - - -class TimsSlice: - def __init__(self, - frame_id: NDArray[np.int32], - scan: NDArray[np.int32], - tof: NDArray[np.int32], - retention_time: NDArray[np.float64], - mobility: NDArray[np.float64], - mz: NDArray[np.float64], - intensity: NDArray[np.float64]): - """Create a TimsSlice. - - Args: - frame_id (NDArray[np.int32]): Frame ID. - scan (NDArray[np.int32]): Scan. - tof (NDArray[np.int32]): TOF. - retention_time (NDArray[np.float64]): Retention time. - mobility (NDArray[np.float64]): Mobility. - mz (NDArray[np.float64]): m/z. - intensity (NDArray[np.float64]): Intensity. - """ - - assert len(frame_id) == len(scan) == len(tof) == len(retention_time) == len(mobility) == len(mz) == len( - intensity), "All arrays must have the same length." - - self.__slice_ptr = pims.PyTimsSlice( - frame_id, scan, tof, retention_time, mobility, mz, intensity - ) - self.__current_index = 0 - - @classmethod - def from_py_tims_slice(cls, tims_slice: pims.PyTimsSlice): - """Create a TimsSlice from a PyTimsSlice. - - Args: - tims_slice (pims.PyTimsSlice): PyTimsSlice to create the TimsSlice from. - - Returns: - TimsSlice: TimsSlice created from the PyTimsSlice. - """ - instance = cls.__new__(cls) - instance.__slice_ptr = tims_slice - instance.__current_index = 0 - return instance - - @property - def first_frame_id(self) -> int: - """First frame ID. - - Returns: - int: First frame ID. - """ - return self.__slice_ptr.first_frame_id - - @property - def last_frame_id(self) -> int: - """Last frame ID. - - Returns: - int: Last frame ID. - """ - return self.__slice_ptr.last_frame_id - - def __repr__(self): - return f"TimsSlice({self.first_frame_id}, {self.last_frame_id})" - - @property - def precursors(self): - return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_precursors()) - - @property - def fragments(self): - return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_fragments_dda()) - - def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, scan_min: int = 0, scan_max: int = 1000, - mobility_min: float = 0.0, - mobility_max: float = 2.0, - intensity_min: float = 0.0, intensity_max: float = 1e9, num_threads: int = 4) -> 'TimsSlice': - """Filter the slice by m/z, scan and intensity. - - Args: - mz_min (float): Minimum m/z value. - mz_max (float): Maximum m/z value. - scan_min (int, optional): Minimum scan value. Defaults to 0. - scan_max (int, optional): Maximum scan value. Defaults to 1000. - mobility_min (float, optional): Minimum inverse mobility value. Defaults to 0.0. - mobility_max (float, optional): Maximum inverse mobility value. Defaults to 2.0. - intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. - intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. - num_threads (int, optional): Number of threads to use. Defaults to 4. - - Returns: - TimsSlice: Filtered slice. - """ - return TimsSlice.from_py_tims_slice( - self.__slice_ptr.filter_ranged(mz_min, mz_max, scan_min, scan_max, mobility_min, mobility_max, - intensity_min, intensity_max, num_threads)) - - @property - def frames(self) -> List[TimsFrame]: - """Get the frames. - - Returns: - List[TimsFrame]: Frames. - """ - return [TimsFrame.from_py_tims_frame(frame) for frame in self.__slice_ptr.get_frames()] - - def to_resolution(self, resolution: int, num_threads: int = 4) -> 'TimsSlice': - """Convert the slice to a given resolution. - - Args: - resolution (int): Resolution. - num_threads (int, optional): Number of threads to use. Defaults to 4. - - Returns: - TimsSlice: Slice with given resolution. - """ - return TimsSlice.from_py_tims_slice(self.__slice_ptr.to_resolution(resolution, num_threads)) - - def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, - min_intensity: float = 1, num_threads: int = 4) -> List[MzSpectrum]: - """Convert the slice to a list of windows. - - Args: - window_length (float, optional): Window length. Defaults to 10. - overlapping (bool, optional): Whether the windows should overlap. Defaults to True. - min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. - min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. - num_threads (int, optional): Number of threads to use. Defaults to 1. - - Returns: - List[MzSpectrum]: List of windows. - """ - return [MzSpectrum.from_py_mz_spectrum(spec) for spec in self.__slice_ptr.to_windows( - window_length, overlapping, min_num_peaks, min_intensity, num_threads)] - - @property - def df(self) -> pd.DataFrame: - """Get the data as a pandas DataFrame. - - Returns: - pd.DataFrame: Data. - """ - columns = ['frame', 'scan', 'tof', 'retention_time', 'mobility', 'mz', 'intensity'] - return pd.DataFrame({c: v for c, v in zip(columns, self.__slice_ptr.to_arrays())}) - - def __iter__(self): - return self - - def __next__(self): - if self.__current_index < self.__slice_ptr.frame_count: - frame_ptr = self.__slice_ptr.get_frame_at_index(self.__current_index) - self.__current_index += 1 - if frame_ptr is not None: - return TimsFrame.from_py_tims_frame(frame_ptr) - else: - raise ValueError("Frame pointer is None for valid index.") - else: - self.__current_index = 0 # Reset for next iteration - raise StopIteration - - def vectorized(self, resolution: int = 2, num_threads: int = 4) -> 'TimsSliceVectorized': - """Get a vectorized version of the slice. - - Args: - resolution (int, optional): Resolution. Defaults to 2. - num_threads (int, optional): Number of threads to use. Defaults to 4. - - Returns: - TimsSliceVectorized: Vectorized version of the slice. - """ - return TimsSliceVectorized.from_vectorized_py_tims_slice(self.__slice_ptr.vectorized(resolution, num_threads)) - - def get_tims_planes(self, tof_max_value: int = 400_000, num_chunks: int = 7, num_threads: int = 4) -> List[ - 'TimsPlane']: - return [TimsPlane.from_py_tims_plane(plane) for plane in - self.__slice_ptr.to_tims_planes(tof_max_value, num_chunks, num_threads)] - - -class TimsSliceVectorized: - def __init__(self): - self.__slice_ptr = None - self.__current_index = 0 - - @classmethod - def from_vectorized_py_tims_slice(cls, tims_slice: pims.PyTimsSliceVectorized): - """Create a TimsSlice from a PyTimsSlice. - - Args: - tims_slice (pims.PyTimsSlice): PyTimsSlice to create the TimsSlice from. - - Returns: - TimsSlice: TimsSlice created from the PyTimsSlice. - """ - instance = cls.__new__(cls) - instance.__slice_ptr = tims_slice - instance.__current_index = 0 - return instance - - @property - def first_frame_id(self) -> int: - """First frame ID. - - Returns: - int: First frame ID. - """ - return self.__slice_ptr.first_frame_id - - @property - def last_frame_id(self) -> int: - """Last frame ID. - - Returns: - int: Last frame ID. - """ - return self.__slice_ptr.last_frame_id - - @property - def precursors(self): - return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_precursors()) - - @property - def fragments(self): - return TimsSlice.from_py_tims_slice(self.__slice_ptr.get_fragments_dda()) - - @property - def frames(self) -> List[TimsFrameVectorized]: - """Get the frames. - - Returns: - List[TimsFrame]: Frames. - """ - return [TimsFrameVectorized.from_py_tims_frame_vectorized(frame) for frame in - self.__slice_ptr.get_vectorized_frames()] - - @property - def df(self) -> pd.DataFrame: - """Get the data as a pandas DataFrame. - - Returns: - pd.DataFrame: Data. - """ - columns = ['frame', 'scan', 'tof', 'retention_time', 'mobility', 'index', 'intensity'] - return pd.DataFrame({c: v for c, v in zip(columns, self.__slice_ptr.to_arrays())}) - - def __iter__(self): - return self - - def __next__(self): - if self.__current_index < self.__slice_ptr.frame_count: - frame_ptr = self.__slice_ptr.get_frame_at_index(self.__current_index) - self.__current_index += 1 - if frame_ptr is not None: - return TimsFrameVectorized.from_py_tims_frame_vectorized(frame_ptr) - else: - raise ValueError("Frame pointer is None for valid index.") - else: - self.__current_index = 0 - raise StopIteration - - def __repr__(self): - return f"TimsSliceVectorized({self.first_frame_id}, {self.last_frame_id})" - - def get_tensor_repr(self, dense=True, zero_index=True, re_index=True, frame_max=None, scan_max=None, - index_max=None): - - frames, scans, _, _, _, indices, intensities = self.__slice_ptr.to_arrays() - - if zero_index: - scans = scans - np.min(scans) - frames = frames - np.min(frames) - indices = indices - np.min(indices) - - if re_index: - frames = re_index_indices(frames) - - if scan_max is None: - m_s = np.max(scans) + 1 - else: - m_s = scan_max + 1 - - if index_max is None: - m_i = np.max(indices) + 1 - else: - m_i = index_max + 1 - - if frame_max is None: - m_f = np.max(frames) + 1 - else: - m_f = frame_max + 1 - - sv = sp.reorder( - sp.SparseTensor(indices=np.c_[frames, scans, indices], values=intensities, dense_shape=(m_f, m_s, m_i))) - - if dense: - return sp.to_dense(sv) - else: - return sv - - -class TimsPlane: - def __init__(self): - self.__plane_ptr = None - - @classmethod - def from_py_tims_plane(cls, plane: pims.PyTimsPlane): - """Create a TimsPlane from a PyTimsPlane. - - Args: - plane (pims.PyTimsPlane): PyTimsPlane to create the TimsPlane from. - - Returns: - TimsPlane: TimsPlane created from the PyTimsPlane. - """ - instance = cls.__new__(cls) - instance.__plane_ptr = plane - return instance - - @property - def mz_mean(self): - return self.__plane_ptr.mz_mean - - @property - def mz_std(self): - return self.__plane_ptr.mz_std - - @property - def tof_mean(self): - return self.__plane_ptr.tof_mean - - @property - def tof_std(self): - return self.__plane_ptr.tof_std - - @property - def frame_ids(self): - return self.__plane_ptr.frame_ids - - @property - def scans(self): - return self.__plane_ptr.scans - - @property - def intensities(self): - return self.__plane_ptr.intensity - - @property - def retention_times(self): - return self.__plane_ptr.retention_times - - @property - def mobilities(self): - return self.__plane_ptr.mobilities - - @property - def num_points(self): - return len(self.frame_ids) - - @property - def df(self): - return pd.DataFrame({ - 'frame': self.frame_ids, - 'scan': self.scans, - 'retention_time': self.retention_times, - 'mobility': self.mobilities, - 'intensity': self.intensities - }) - - def __repr__(self): - return (f"TimsPlane(mz_mean: " - f"{np.round(self.mz_mean, 4)}, " - f"mz_std: {np.round(self.mz_std, 4)}," - f" tof_mean: {np.round(self.tof_mean, 4)}, " - f"tof_std: {np.round(self.tof_std, 4)}, " - f"num_points: {len(self.frame_ids)})") diff --git a/imspy/imspy/spectrum.py b/imspy/imspy/spectrum.py deleted file mode 100644 index 80baa757..00000000 --- a/imspy/imspy/spectrum.py +++ /dev/null @@ -1,377 +0,0 @@ -import numpy as np -from typing import List, Tuple - -import pandas as pd -from numpy.typing import NDArray - -import imspy_connector as pims - - -class IndexedMzSpectrum: - def __init__(self, index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): - """IndexedMzSpectrum class. - - Args: - index (NDArray[np.int32]): Index. - mz (NDArray[np.float64]): m/z. - intensity (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the index, mz and intensity arrays are not equal. - """ - assert len(index) == len(mz) == len(intensity), ("The length of the index, mz and intensity arrays must be " - "equal.") - self.__spec_ptr = pims.PyIndexedMzSpectrum(index, mz, intensity) - - @classmethod - def from_py_indexed_mz_spectrum(cls, spec: pims.PyIndexedMzSpectrum): - """Create a IndexedMzSpectrum from a PyIndexedMzSpectrum. - - Args: - spec (pims.PyIndexedMzSpectrum): PyIndexedMzSpectrum to create the IndexedMzSpectrum from. - - Returns: - IndexedMzSpectrum: IndexedMzSpectrum created from the PyIndexedMzSpectrum. - """ - instance = cls.__new__(cls) - instance.__spec_ptr = spec - return instance - - @property - def index(self) -> NDArray[np.int32]: - """Index. - - Returns: - NDArray[np.int32]: Index. - """ - return self.__spec_ptr.index - - @property - def mz(self) -> NDArray[np.float64]: - """m/z. - - Returns: - NDArray[np.float64]: m/z. - """ - return self.__spec_ptr.mz - - @property - def intensity(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__spec_ptr.intensity - - @property - def df(self) -> pd.DataFrame: - """Data. - - Returns: - pd.DataFrame: Data. - """ - - return pd.DataFrame({'index': self.index, 'mz': self.mz, 'intensity': self.intensity}) - - def __repr__(self): - return f"IndexedMzSpectrum(num_peaks={len(self.index)})" - - -class MzSpectrum: - def __init__(self, mz: NDArray[np.float64], intensity: NDArray[np.float64]): - """MzSpectrum class. - - Args: - mz (NDArray[np.float64]): m/z. - intensity (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the mz and intensity arrays are not equal. - """ - assert len(mz) == len(intensity), "The length of the mz and intensity arrays must be equal." - self.__spec_ptr = pims.PyMzSpectrum(mz, intensity) - - @property - def mz(self) -> NDArray[np.float64]: - """m/z. - - Returns: - NDArray[np.float64]: m/z. - """ - return self.__spec_ptr.mz - - @property - def intensity(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__spec_ptr.intensity - - @property - def df(self) -> pd.DataFrame: - """Data. - - Returns: - pd.DataFrame: Data. - """ - - return pd.DataFrame({'mz': self.mz, 'intensity': self.intensity}) - - @classmethod - def from_py_mz_spectrum(cls, spec: pims.PyMzSpectrum): - """Create a MzSpectrum from a PyMzSpectrum. - - Args: - spec (pims.PyMzSpectrum): PyMzSpectrum to create the MzSpectrum from. - - Returns: - MzSpectrum: MzSpectrum created from the PyMzSpectrum. - """ - instance = cls.__new__(cls) - instance.__spec_ptr = spec - return instance - - def __repr__(self): - return f"MzSpectrum(num_peaks={len(self.mz)})" - - def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, - min_intensity: float = 1) -> Tuple[NDArray, List['MzSpectrum']]: - """Convert the spectrum to a list of windows. - - Args: - window_length (float, optional): Window length. Defaults to 10. - overlapping (bool, optional): Whether the windows should overlap. Defaults to True. - min_num_peaks (int, optional): Minimum number of peaks in a window. Defaults to 5. - min_intensity (float, optional): Minimum intensity of a peak in a window. Defaults to 1. - - Returns: - Tuple[NDArray, List[MzSpectrum]]: List of windows. - """ - - indices, windows = self.__spec_ptr.to_windows(window_length, overlapping, min_num_peaks, min_intensity) - return indices, [MzSpectrum.from_py_mz_spectrum(window) for window in windows] - - def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0, - intensity_max: float = 1e9) -> 'MzSpectrum': - """Filter the spectrum for a given m/z range and intensity range. - - Args: - mz_min (float): Minimum m/z value. - mz_max (float): Maximum m/z value. - intensity_min (float, optional): Minimum intensity value. Defaults to 0.0. - intensity_max (float, optional): Maximum intensity value. Defaults to 1e9. - - Returns: - MzSpectrum: Filtered spectrum. - """ - return MzSpectrum.from_py_mz_spectrum( - self.__spec_ptr.filter_ranged(mz_min, mz_max, intensity_min, intensity_max)) - - def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized': - """Convert the spectrum to a vectorized spectrum. - - Args: - resolution (int, optional): Resolution. Defaults to 2. - - Returns: - MzSpectrumVectorized: Vectorized spectrum. - """ - return MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.vectorized(resolution)) - - -class MzSpectrumVectorized: - def __init__(self, indices: NDArray[np.int32], values: NDArray[np.float64], resolution: int): - """MzSpectrum class. - - Args: - mz (NDArray[np.float64]): m/z. - values (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the mz and intensity arrays are not equal. - """ - assert len(indices) == len(values), "The length of the mz and intensity arrays must be equal." - self.__spec_ptr = pims.PyMzSpectrumVectorized(indices, values, resolution) - - @classmethod - def from_py_mz_spectrum_vectorized(cls, spec: pims.PyMzSpectrumVectorized): - """Create a MzSpectrum from a PyMzSpectrum. - - Args: - spec (pims.PyMzSpectrum): PyMzSpectrum to create the MzSpectrum from. - - Returns: - MzSpectrum: MzSpectrum created from the PyMzSpectrum. - """ - instance = cls.__new__(cls) - instance.__spec_ptr = spec - return instance - - @property - def resolution(self) -> float: - """Resolution. - - Returns: - float: Resolution. - """ - return self.__spec_ptr.resolution - - @property - def indices(self) -> NDArray[np.int32]: - """m/z. - - Returns: - NDArray[np.float64]: m/z. - """ - return self.__spec_ptr.indices - - @property - def values(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__spec_ptr.values - - def __repr__(self): - return f"MzSpectrumVectorized(num_values={len(self.values)})" - - -class TimsSpectrum: - def __init__(self, frame_id: int, scan: int, retention_time: float, mobility: float, ms_type: int, - index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): - """TimsSpectrum class. - - Args: - index (NDArray[np.int32]): Index. - mz (NDArray[np.float64]): m/z. - intensity (NDArray[np.float64]): Intensity. - - Raises: - AssertionError: If the length of the index, mz and intensity arrays are not equal. - """ - assert len(index) == len(mz) == len(intensity), ("The length of the index, mz and intensity arrays must be " - "equal.") - self.__spec_ptr = pims.PyTimsSpectrum(frame_id, scan, retention_time, mobility, ms_type, index, mz, intensity) - - @classmethod - def from_py_tims_spectrum(cls, spec: pims.PyTimsSpectrum): - """Create a TimsSpectrum from a PyTimsSpectrum. - - Args: - spec (pims.PyTimsSpectrum): PyTimsSpectrum to create the TimsSpectrum from. - - Returns: - TimsSpectrum: TimsSpectrum created from the PyTimsSpectrum. - """ - instance = cls.__new__(cls) - instance.__spec_ptr = spec - return instance - - @property - def index(self) -> NDArray[np.int32]: - """Index. - - Returns: - NDArray[np.int32]: Index. - """ - return self.__spec_ptr.index - - @property - def mz(self) -> NDArray[np.float64]: - """m/z. - - Returns: - NDArray[np.float64]: m/z. - """ - return self.__spec_ptr.mz - - @property - def intensity(self) -> NDArray[np.float64]: - """Intensity. - - Returns: - NDArray[np.float64]: Intensity. - """ - return self.__spec_ptr.intensity - - @property - def ms_type(self) -> str: - """MS type. - - Returns: - str: MS type. - """ - return self.__spec_ptr.ms_type - - @property - def mobility(self) -> float: - """Inverse mobility. - - Returns: - float: Inverse mobility. - """ - return self.__spec_ptr.mobility - - @property - def scan(self) -> int: - """Scan. - - Returns: - int: Scan. - """ - return self.__spec_ptr.scan - - @property - def retention_time(self) -> float: - """Retention time. - - Returns: - float: Retention time. - """ - return self.__spec_ptr.retention_time - - @property - def frame_id(self) -> int: - """Frame ID. - - Returns: - int: Frame ID. - """ - return self.__spec_ptr.frame_id - - @property - def mz_spectrum(self) -> MzSpectrum: - """Get the MzSpectrum. - - Returns: - MzSpectrum: Spectrum. - """ - return MzSpectrum.from_py_mz_spectrum(self.__spec_ptr.mz_spectrum) - - @property - def df(self) -> pd.DataFrame: - """Data. - - Returns: - pd.DataFrame: Data. - """ - - return pd.DataFrame({ - 'frame': np.repeat(self.frame_id, len(self.index)), - 'retention_time': np.repeat(self.retention_time, len(self.index)), - 'scan': np.repeat(self.scan, len(self.index)), - 'mobility': np.repeat(self.mobility, len(self.index)), - 'tof': self.index, - 'mz': self.mz, - 'intensity': self.intensity - }) - - def __repr__(self): - return (f"TimsSpectrum(id={self.frame_id}, retention_time={np.round(self.retention_time, 2)}, " - f"scan={self.scan}, mobility={np.round(self.mobility, 2)}, ms_type={self.ms_type}, " - f"num_peaks={len(self.index)})") From 389396e49b4ca378c7c584f9a2ae1289baac0769 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 13 Nov 2023 16:08:58 +0100 Subject: [PATCH 7/7] super slow centroiding simulation --- .../simulation/run_example_simulation.py | 8 ++--- imspy/imspy/data/spectrum.py | 36 +++++++++++++++++-- imspy/imspy/simulation/experiment.py | 2 +- imspy_connector/src/py_mz_spectrum.rs | 4 +++ mscore/src/mz_spectrum.rs | 29 ++++++++++++--- 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/imspy/examples/simulation/run_example_simulation.py b/imspy/examples/simulation/run_example_simulation.py index 1df9cef0..a77a2d7c 100644 --- a/imspy/examples/simulation/run_example_simulation.py +++ b/imspy/examples/simulation/run_example_simulation.py @@ -1,5 +1,5 @@ -from pyims.simulation.experiment import LcImsMsMs -from pyims.simulation.hardware_models import (NeuralChromatographyApex, +from imspy.simulation.experiment import LcImsMsMs +from imspy.simulation.hardware_models import (NeuralChromatographyApex, NormalChromatographyProfileModel, LiquidChromatography, ElectroSpray, @@ -10,8 +10,8 @@ AveragineModel, BinomialIonSource ) -from pyims.proteome import ProteinSample, Trypsin, ORGANISM -from pyims.chemistry import BufferGas +from imspy.proteome import ProteinSample, Trypsin, ORGANISM +from imspy.chemistry import BufferGas import pandas as pd import numpy as np diff --git a/imspy/imspy/data/spectrum.py b/imspy/imspy/data/spectrum.py index 2bb3a4c8..dcd29bd5 100644 --- a/imspy/imspy/data/spectrum.py +++ b/imspy/imspy/data/spectrum.py @@ -1,13 +1,29 @@ from __future__ import annotations import json import numpy as np -from typing import List, Tuple +from typing import List, Tuple, Callable import pandas as pd from numpy.typing import NDArray - +from scipy.signal import find_peaks import imspy_connector as pims +def get_peak_integral(peaks: NDArray[np.int32], peak_info: dict) -> NDArray[np.float64]: + """Calculates the integral of the peaks in a spectrum. + + Args: + peaks (NDArray[np.int32]): Peak indices. + peak_info (dict): Peak info. + + Returns: + NDArray[np.float64]: Peak integrals. + """ + integrals = np.zeros(len(peaks), dtype=np.float64) + FWHM = peak_info['widths'] + h = peak_info['prominences'] + integrals = np.sqrt(2*np.pi) * h * FWHM / (2*np.sqrt(2*np.log(2))) + return integrals + class IndexedMzSpectrum: def __init__(self, index: NDArray[np.int32], mz: NDArray[np.float64], intensity: NDArray[np.float64]): """IndexedMzSpectrum class. @@ -293,6 +309,22 @@ def values(self) -> NDArray[np.float64]: """ return self.__spec_ptr.values + def to_centroided(self, integrate_method: Callable = get_peak_integral) -> MzSpectrum: + """Convert the spectrum to a centroided spectrum. + + Returns: + MzSpectrum: Centroided spectrum. + """ + # first generate dense spectrum + dense_spectrum = MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.to_dense_spectrum(None)) + # find peaks in the dense spectrum and widths with scipy + peaks, peak_info = find_peaks(dense_spectrum.values, height=0, width=(0,0.5)) + # then get the peak integrals + integrals = integrate_method(peaks, peak_info) + # then create a new spectrum with the peak indices and the integrals + return MzSpectrum.from_py_mz_spectrum(pims.PyMzSpectrum(dense_spectrum.indices[peaks]/np.power(10,dense_spectrum.resolution), integrals)) + + def __repr__(self): return f"MzSpectrumVectorized(num_values={len(self.values)})" diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py index 0f53cd20..663f7efe 100644 --- a/imspy/imspy/simulation/experiment.py +++ b/imspy/imspy/simulation/experiment.py @@ -167,7 +167,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan for (s_id,scan_spectra_list) in frame_dict.items(): if len(scan_spectra_list) > 0: - scan_spectrum = MzSpectrum.from_mz_spectra_list(scan_spectra_list,resolution = resolution) + scan_spectrum = MzSpectrum.from_mz_spectra_list(scan_spectra_list,resolution = resolution).vectorized(resolution=resolution).to_centroided() output_dict["mz"].append(scan_spectrum.mz.tolist()) output_dict["intensity"].append(scan_spectrum.intensity.tolist()) output_dict["scan_id"].append(s_id) diff --git a/imspy_connector/src/py_mz_spectrum.rs b/imspy_connector/src/py_mz_spectrum.rs index 0bfb1020..5c90f626 100644 --- a/imspy_connector/src/py_mz_spectrum.rs +++ b/imspy_connector/src/py_mz_spectrum.rs @@ -133,6 +133,10 @@ impl PyMzSpectrumVectorized { }) } + pub fn to_dense_spectrum(&self, max_index: Option) -> PyMzSpectrumVectorized { + PyMzSpectrumVectorized { inner: self.inner.to_dense_spectrum(max_index) } + } + #[getter] pub fn resolution(&self) -> i32 { self.inner.resolution diff --git a/mscore/src/mz_spectrum.rs b/mscore/src/mz_spectrum.rs index ef40e72a..9307a097 100644 --- a/mscore/src/mz_spectrum.rs +++ b/mscore/src/mz_spectrum.rs @@ -487,15 +487,36 @@ impl MzSpectrumVectorized { /// # Arguments /// /// * `max_index` - The maximum index for the dense vector. - pub fn to_dense(&self, max_index: usize) -> DVector { - let mut dense = DVector::zeros(max_index + 1); + + fn get_max_index(&self) -> usize { + let base: i32 = 10; + let max_mz: i32 = 2000; + let max_index: usize = (max_mz*base.pow(self.resolution as u32)) as usize; + max_index + } + pub fn to_dense(&self, max_index: Option) -> DVector { + let max_index = match max_index { + Some(max_index) => max_index, + None => self.get_max_index(), + }; + let mut dense_intensities: DVector = DVector::::zeros(max_index + 1); for (&index, &intensity) in self.indices.iter().zip(self.values.iter()) { if (index as usize) <= max_index { - dense[index as usize] = intensity; + dense_intensities[index as usize] = intensity; } } - dense + dense_intensities + } + pub fn to_dense_spectrum(&self, max_index: Option) -> MzSpectrumVectorized{ + let max_index = match max_index { + Some(max_index) => max_index, + None => self.get_max_index(), + }; + let dense_intensities: Vec = self.to_dense(Some(max_index)).data.into(); + let dense_indices: Vec = (0..=max_index).map(|i| i as i32).collect(); + let dense_spectrum: MzSpectrumVectorized = MzSpectrumVectorized { resolution: (self.resolution), indices: (dense_indices), values: (dense_intensities) }; + dense_spectrum } }