Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add embeddings to sqlite #228

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ library_creator = LibraryFilesCreator(cleaned_library_spectra,
output_directory=directory_for_library_and_models,
ms2ds_model_file_name=ms2ds_model_file_name,
s2v_model_file_name=s2v_model_file_name, )
library_creator.create_all_library_files()
library_creator.create_sqlite_file()
```

To run MS2Query on a library you created yourself, follow the instructions under Run MS2Query. Both the command-line version and the code version should work.
Expand Down
6 changes: 4 additions & 2 deletions ms2query/create_new_library/add_classifire_classifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,12 @@ def select_compound_classes(spectra):
if npc_results is None:
print(f"no npc annotation was found for inchikey {inchikey14}")
inchikey_results_list[i] += ["", "", "", ""]
return inchikey_results_list
compound_classes_df = _convert_to_dataframe(inchikey_results_list)
assert compound_classes_df.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
return compound_classes_df


def convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
def _convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
header_list = [
'inchikey', 'cf_kingdom',
'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent',
Expand Down
48 changes: 7 additions & 41 deletions ms2query/create_new_library/create_sqlite_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,15 @@
from tqdm import tqdm
from ms2query.create_new_library.calculate_tanimoto_scores import \
calculate_highest_tanimoto_score
from ms2query.utils import return_non_existing_file_name


def make_sqlfile_wrapper(sqlite_file_name: str,
                         list_of_spectra: List[Spectrum],
                         columns_dict: Dict[str, str] = None,
                         compound_classes: pd.DataFrame = None,
                         progress_bars: bool = True):
    """Build the sqlite file holding the spectrum information MS2Query needs.

    The tables are initialised first, then the spectrum data table and the
    inchikeys table are filled, in that order.

    Args:
    -------
    sqlite_file_name:
        Requested name of the sqlite file. The name is first passed through
        return_non_existing_file_name and the returned name is the one that
        is actually written to (presumably to avoid touching an existing
        file - confirm against ms2query.utils).
    list_of_spectra:
        A list with spectrum objects.
    columns_dict:
        Dictionary with additional metadata column names as keys and their
        datatype as values. It is forwarded to initialize_tables as
        additional_metadata_columns_dict. Default = None.
    compound_classes:
        Optional dataframe with compound class annotations. Its index name
        must be "inchikey"; its column names are forwarded to
        initialize_tables as additional inchikey columns. Default = None
        results in no additional inchikey columns.
    progress_bars:
        Forwarded to the table filling functions to switch their progress
        bars on or off.
    """
    target_file = return_non_existing_file_name(sqlite_file_name)

    if compound_classes is None:
        inchikey_columns = []
    else:
        inchikey_columns = list(compound_classes.columns)
        assert compound_classes.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"

    initialize_tables(target_file,
                      additional_metadata_columns_dict=columns_dict,
                      additional_inchikey_columns=inchikey_columns)
    fill_spectrum_data_table(target_file, list_of_spectra, progress_bar=progress_bars)
    fill_inchikeys_table(target_file, list_of_spectra,
                         compound_classes=compound_classes,
                         progress_bars=progress_bars)
def add_dataframe_to_sqlite(sqlite_file_name,
                            table_name,
                            dataframe: pd.DataFrame):
    """Store a dataframe as a new table in a sqlite file.

    The dataframe index is written as a column named "spectrumid".

    Args:
    -------
    sqlite_file_name:
        File name of the sqlite database to connect to.
    table_name:
        Name of the table to create. The table must not exist yet, since
        the dataframe is written with if_exists='fail'.
    dataframe:
        The dataframe to store.

    Raises:
    -------
    ValueError:
        Raised by pandas when a table called table_name already exists.
    """
    conn = sqlite3.connect(sqlite_file_name)
    try:
        dataframe.to_sql(table_name, conn, if_exists='fail', index=True, index_label="spectrumid")
        conn.commit()
    finally:
        # Close the connection even when to_sql raises (e.g. the table
        # already exists), so the database handle is never leaked.
        conn.close()


def initialize_tables(sqlite_file_name: str,
Expand Down
220 changes: 123 additions & 97 deletions ms2query/create_new_library/library_files_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
from spec2vec.vector_operations import calc_vector
from tqdm import tqdm
from ms2query.clean_and_filter_spectra import create_spectrum_documents
from ms2query.create_new_library.add_classifire_classifications import (
convert_to_dataframe, select_compound_classes)
from ms2query.create_new_library.create_sqlite_database import \
make_sqlfile_wrapper
from ms2query.create_new_library.add_classifire_classifications import \
select_compound_classes
from ms2query.create_new_library.create_sqlite_database import (
add_dataframe_to_sqlite, fill_inchikeys_table, fill_spectrum_data_table,
initialize_tables)


class LibraryFilesCreator:
Expand Down Expand Up @@ -47,10 +48,10 @@ class LibraryFilesCreator:
"""
def __init__(self,
library_spectra: List[Spectrum],
output_directory: Union[str, Path],
sqlite_file_name: Union[str, Path],
s2v_model_file_name: str = None,
ms2ds_model_file_name: str = None,
add_compound_classes: bool = True
compound_classes: Union[bool, pd.DataFrame, None] = True
):
"""Creates files needed to run queries on a library

Expand All @@ -70,108 +71,133 @@ def __init__(self,
File name of a ms2ds model
"""
# pylint: disable=too-many-arguments
self.progress_bars = True
self.output_directory = output_directory
if not os.path.exists(self.output_directory):
os.mkdir(self.output_directory)
self.sqlite_file_name = os.path.join(output_directory, "ms2query_library.sqlite")
self.ms2ds_embeddings_file_name = os.path.join(output_directory, "ms2ds_embeddings.pickle")
self.s2v_embeddings_file_name = os.path.join(output_directory, "s2v_embeddings.pickle")
# These checks are performed at the start, since the filtering of spectra can take long
self._check_for_existing_files()
if os.path.exists(sqlite_file_name):
raise FileExistsError("The sqlite file already exists")
self.sqlite_file_name = sqlite_file_name

# Load in spec2vec model
if s2v_model_file_name is None:
self.s2v_model = None
else:
assert os.path.exists(s2v_model_file_name), "Spec2Vec model file does not exists"
if os.path.exists(s2v_model_file_name):
self.s2v_model = Word2Vec.load(s2v_model_file_name)
# load in ms2ds model
if ms2ds_model_file_name is None:
self.ms2ds_model = None
else:
assert os.path.exists(ms2ds_model_file_name), "MS2Deepscore model file does not exists"
raise FileNotFoundError("Spec2Vec model file does not exists")
# load in ms2ds model
if os.path.exists(ms2ds_model_file_name):
self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)
else:
raise FileNotFoundError("MS2Deepscore model file does not exists")
# Initialise spectra
self.list_of_spectra = library_spectra

# Run default filters
self.list_of_spectra = [msfilters.default_filters(s) for s in tqdm(self.list_of_spectra,
desc="Applying default filters to spectra")]
self.add_compound_classes = add_compound_classes

def _check_for_existing_files(self):
assert not os.path.exists(self.sqlite_file_name), \
f"The file {self.sqlite_file_name} already exists," \
f" choose a different output_base_filename"
assert not os.path.exists(self.ms2ds_embeddings_file_name), \
f"The file {self.ms2ds_embeddings_file_name} " \
f"already exists, choose a different output_base_filename"
assert not os.path.exists(self.s2v_embeddings_file_name), \
f"The file {self.s2v_embeddings_file_name} " \
f"already exists, choose a different output_base_filename"

def create_all_library_files(self):
"""Creates files with embeddings and a sqlite file with spectra data
"""
self.create_sqlite_file()
self.store_s2v_embeddings()
self.store_ms2ds_embeddings()
self.compound_classes = self.add_compound_classes(compound_classes)
if self.compound_classes is not None:
self.additional_inchikey_columns = list(compound_classes.columns)
else:
self.additional_inchikey_columns = []

def create_sqlite_file(self):
if self.add_compound_classes:
self.progress_bars = True
self.additional_metadata_columns = {"precursor_mz": "REAL"}

def add_compound_classes(self,
compound_classes: Union[pd.DataFrame, bool, None]):
"""Calculates compound classes if True, otherwise uses given compound_classes
"""
if compound_classes is True:
compound_classes = select_compound_classes(self.list_of_spectra)
compound_classes_df = convert_to_dataframe(compound_classes)
elif compound_classes is not None and isinstance(compound_classes, pd.DataFrame):
if not compound_classes.index.name == "inchikey":
raise ValueError("Expected a pandas dataframe with inchikey as index name")
elif compound_classes is False or compound_classes is None:
compound_classes = None
else:
compound_classes_df = None
make_sqlfile_wrapper(
self.sqlite_file_name,
self.list_of_spectra,
columns_dict={"precursor_mz": "REAL"},
compound_classes=compound_classes_df,
progress_bars=self.progress_bars,
)

def store_ms2ds_embeddings(self):
"""Creates a pickled file with embeddings scores for spectra

A dataframe with as index randomly generated spectrum indexes and as columns the indexes
of the vector is converted to pickle.
"""
assert not os.path.exists(self.ms2ds_embeddings_file_name), \
"Given ms2ds_embeddings_file_name already exists"
assert self.ms2ds_model is not None, "No MS2deepscore model was provided"
ms2ds = MS2DeepScore(self.ms2ds_model,
progress_bar=self.progress_bars)

# Compute spectral embeddings
embeddings = ms2ds.calculate_vectors(self.list_of_spectra)
spectrum_ids = np.arange(0, len(self.list_of_spectra))
all_embeddings_df = pd.DataFrame(embeddings, index=spectrum_ids)
all_embeddings_df.to_pickle(self.ms2ds_embeddings_file_name)

def store_s2v_embeddings(self):
"""Creates and stored a dataframe with embeddings as pickled file

A dataframe with as index randomly generated spectrum indexes and as columns the indexes
of the vector is converted to pickle.
raise ValueError("Expected a dataframe or True or None for compound classes")
return compound_classes

def create_sqlite_file(self):
"""Wrapper to create sqlite file containing spectrum information needed for MS2Query

Args:
-------
sqlite_file_name:
Name of sqlite_file that should be created, if it already exists the
tables are added. If the tables in this sqlite file already exist, they
will be overwritten.
list_of_spectra:
A list with spectrum objects
columns_dict:
Dictionary with as keys columns that need to be added in addition to
the default columns and as values the datatype. The defaults columns
are spectrum_id, peaks, intensities and metadata. The additional
columns should be the same names that are in the metadata dictionary,
since these values will be automatically added in the function
add_list_of_spectra_to_sqlite.
Default = None results in the default columns.
progress_bars:
If progress_bars is True progress bars will be shown for the different
parts of the progress.
"""
assert not os.path.exists(self.s2v_embeddings_file_name), \
"Given s2v_embeddings_file_name already exists"
assert self.s2v_model is not None, "No spec2vec model was specified"
# Convert Spectrum objects to SpectrumDocument
spectrum_documents = create_spectrum_documents(
self.list_of_spectra,
progress_bar=self.progress_bars)
embeddings_dict = {}
for spectrum_id, spectrum_document in tqdm(enumerate(spectrum_documents),
desc="Calculating embeddings",
disable=not self.progress_bars):
embedding = calc_vector(self.s2v_model,
spectrum_document,
allowed_missing_percentage=100)
embeddings_dict[spectrum_id] = embedding

# Convert to pandas Dataframe
embeddings_dataframe = pd.DataFrame.from_dict(embeddings_dict,
orient="index")
embeddings_dataframe.to_pickle(self.s2v_embeddings_file_name)
if os.path.exists(self.sqlite_file_name):
raise FileExistsError("The sqlite file already exists")
initialize_tables(self.sqlite_file_name,
additional_metadata_columns_dict=self.additional_metadata_columns,
additional_inchikey_columns=self.additional_inchikey_columns)
fill_spectrum_data_table(self.sqlite_file_name, self.list_of_spectra, progress_bar=self.progress_bars)

fill_inchikeys_table(self.sqlite_file_name, self.list_of_spectra,
compound_classes=self.compound_classes,
progress_bars=self.progress_bars)

add_dataframe_to_sqlite(self.sqlite_file_name,
'MS2Deepscore_embeddings',
create_ms2ds_embeddings(self.ms2ds_model, self.list_of_spectra, self.progress_bars), )
add_dataframe_to_sqlite(self.sqlite_file_name,
'Spec2Vec_embeddings',
create_s2v_embeddings(self.s2v_model, self.list_of_spectra, self.progress_bars))


def create_ms2ds_embeddings(ms2ds_model,
                            list_of_spectra,
                            progress_bar=True):
    """Compute the MS2DeepScore embeddings for all spectra.

    Args:
    -------
    ms2ds_model:
        The MS2DeepScore model used to compute the embeddings. Must not be
        None.
    list_of_spectra:
        The spectra for which embeddings are calculated.
    progress_bar:
        Forwarded to MS2DeepScore to switch its progress bar on or off.

    Returns:
    -------
    A dataframe built from the calculated vectors. The index runs from 0 to
    len(list_of_spectra) - 1, i.e. the position of each spectrum in
    list_of_spectra (presumably one embedding row per spectrum - confirm
    against MS2DeepScore.calculate_vectors).
    """
    assert ms2ds_model is not None, "No MS2deepscore model was provided"
    similarity_measure = MS2DeepScore(ms2ds_model, progress_bar=progress_bar)
    vectors = similarity_measure.calculate_vectors(list_of_spectra)
    # Row labels are the positions of the spectra in list_of_spectra
    row_labels = np.arange(0, len(list_of_spectra))
    return pd.DataFrame(vectors, index=row_labels)


def create_s2v_embeddings(s2v_model,
                          list_of_spectra,
                          progress_bar=True):
    """Compute the Spec2Vec embeddings for all spectra.

    The spectra are first converted to spectrum documents, after which an
    embedding is calculated for every document with
    allowed_missing_percentage=100.

    Args:
    -------
    s2v_model:
        The Spec2Vec model used to compute the embeddings. Must not be None.
    list_of_spectra:
        The spectra for which embeddings are calculated.
    progress_bar:
        If True progress bars are shown while creating the spectrum
        documents and while calculating the embeddings.

    Returns:
    -------
    A dataframe with one row per spectrum document. The index is the
    position of the document in the created list of spectrum documents and
    the columns are the positions in the embedding vector.
    """
    assert s2v_model is not None, "No spec2vec model was specified"
    documents = create_spectrum_documents(list_of_spectra,
                                          progress_bar=progress_bar)
    # Key every embedding on the position of its spectrum document
    vector_per_position = {
        position: calc_vector(s2v_model,
                              document,
                              allowed_missing_percentage=100)
        for position, document in tqdm(enumerate(documents),
                                       desc="Calculating embeddings",
                                       disable=not progress_bar)
    }
    return pd.DataFrame.from_dict(vector_per_position, orient="index")
7 changes: 4 additions & 3 deletions ms2query/create_new_library/train_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def train_all_models(annotated_training_spectra,
spec2vec_model_file_name = os.path.join(output_folder, "spec2vec_model.model")
ms2query_model_file_name = os.path.join(output_folder, "ms2query_model.onnx")
ms2ds_history_figure_file_name = os.path.join(output_folder, "ms2deepscore_training_history.svg")
sqlite_model_file = os.path.join(output_folder, "ms2query_model.sqlite")

# Train MS2Deepscore model
train_ms2deepscore_wrapper(annotated_training_spectra,
Expand Down Expand Up @@ -75,11 +76,11 @@ def train_all_models(annotated_training_spectra,

# Create library with all training spectra
library_files_creator = LibraryFilesCreator(annotated_training_spectra,
output_folder,
sqlite_model_file,
spec2vec_model_file_name,
ms2deepscore_model_file_name,
add_compound_classes=settings.add_compound_classes)
library_files_creator.create_all_library_files()
compound_classes=settings.add_compound_classes)
library_files_creator.create_sqlite_file()


def clean_and_train_models(spectrum_file: str,
Expand Down
16 changes: 9 additions & 7 deletions ms2query/create_new_library/train_ms2query_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ def train_ms2query_model(training_spectra,
ms2ds_model_file_name,
s2v_model_file_name,
fraction_for_training):
os.makedirs(library_files_folder, exist_ok=True)

# Select spectra belonging to a single InChIKey
library_spectra, unique_inchikey_query_spectra = split_spectra_on_inchikeys(training_spectra,
fraction_for_training)
Expand All @@ -125,17 +127,17 @@ def train_ms2query_model(training_spectra,
query_spectra_for_training = unique_inchikey_query_spectra + single_spectra_query_spectra

# Create library files for training ms2query
library_creator_for_training = LibraryFilesCreator(library_spectra, output_directory=library_files_folder,
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
add_compound_classes=False)
library_creator_for_training.create_all_library_files()
library_creator_for_training = LibraryFilesCreator(
library_spectra,
sqlite_file_name=os.path.join(library_files_folder, "ms2query_library.sqlite"),
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
compound_classes=None)
library_creator_for_training.create_sqlite_file()

ms2library_for_training = MS2Library(sqlite_file_name=library_creator_for_training.sqlite_file_name,
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
pickled_s2v_embeddings_file_name=library_creator_for_training.s2v_embeddings_file_name,
pickled_ms2ds_embeddings_file_name=library_creator_for_training.ms2ds_embeddings_file_name,
ms2query_model_file_name=None)
# Create training data MS2Query model
collector = DataCollectorForTraining(ms2library_for_training)
Expand Down
Loading
Loading