iomega · niekdejonge · Nov 23, 2023 · Nov 23, 2023 · Nov 23, 2023 · Nov 23, 2023
diff --git a/README.md b/README.md
@@ -201,7 +201,7 @@ library_creator = LibraryFilesCreator(cleaned_library_spectra,
                                       output_directory=directory_for_library_and_models,
                                       ms2ds_model_file_name=ms2ds_model_file_name,
                                       s2v_model_file_name=s2v_model_file_name, )
-library_creator.create_all_library_files()
+library_creator.create_sqlite_file()
 ```
 
 To run MS2Query on your own created library. Check out the instructions under Run MS2Query. Both command line and the code version should work.

diff --git a/ms2query/create_new_library/add_classifire_classifications.py b/ms2query/create_new_library/add_classifire_classifications.py
@@ -127,10 +127,12 @@ def select_compound_classes(spectra):
         if npc_results is None:
             print(f"no npc annotation was found for inchikey {inchikey14}")
             inchikey_results_list[i] += ["", "", "", ""]
-    return inchikey_results_list
+    compound_classes_df = _convert_to_dataframe(inchikey_results_list)
+    assert compound_classes_df.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
+    return compound_classes_df
 
 
-def convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
+def _convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
     header_list = [
         'inchikey', 'cf_kingdom',
         'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent',

diff --git a/ms2query/create_new_library/create_sqlite_database.py b/ms2query/create_new_library/create_sqlite_database.py
@@ -10,49 +10,15 @@
 from tqdm import tqdm
 from ms2query.create_new_library.calculate_tanimoto_scores import \
     calculate_highest_tanimoto_score
-from ms2query.utils import return_non_existing_file_name
 
 
-def make_sqlfile_wrapper(sqlite_file_name: str,
-                         list_of_spectra: List[Spectrum],
-                         columns_dict: Dict[str, str] = None,
-                         compound_classes: pd.DataFrame = None,
-                         progress_bars: bool = True):
-    """Wrapper to create sqlite file containing spectrum information needed for MS2Query
-
-    Args:
-    -------
-    sqlite_file_name:
-        Name of sqlite_file that should be created, if it already exists the
-        tables are added. If the tables in this sqlite file already exist, they
-        will be overwritten.
-    list_of_spectra:
-        A list with spectrum objects
-    columns_dict:
-        Dictionary with as keys columns that need to be added in addition to
-        the default columns and as values the datatype. The defaults columns
-        are spectrum_id, peaks, intensities and metadata. The additional
-        columns should be the same names that are in the metadata dictionary,
-        since these values will be automatically added in the function
-        add_list_of_spectra_to_sqlite.
-        Default = None results in the default columns.
-    progress_bars:
-        If progress_bars is True progress bars will be shown for the different
-        parts of the progress.
-    """
-    sqlite_file_name = return_non_existing_file_name(sqlite_file_name)
-    additional_inchikey_columns = []
-    if compound_classes is not None:
-        additional_inchikey_columns = list(compound_classes.columns)
-        assert compound_classes.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
-
-    initialize_tables(sqlite_file_name, additional_metadata_columns_dict=columns_dict,
-                      additional_inchikey_columns=additional_inchikey_columns)
-    fill_spectrum_data_table(sqlite_file_name, list_of_spectra, progress_bar=progress_bars)
-
-    fill_inchikeys_table(sqlite_file_name, list_of_spectra,
-                         compound_classes=compound_classes,
-                         progress_bars=progress_bars)
+def add_dataframe_to_sqlite(sqlite_file_name,
+                            table_name,
+                            dataframe: pd.DataFrame):
+    conn = sqlite3.connect(sqlite_file_name)
+    dataframe.to_sql(table_name, conn, if_exists='fail', index=True, index_label="spectrumid")
+    conn.commit()
+    conn.close()
 
 
 def initialize_tables(sqlite_file_name: str,

diff --git a/ms2query/create_new_library/library_files_creator.py b/ms2query/create_new_library/library_files_creator.py
@@ -16,10 +16,11 @@
 from spec2vec.vector_operations import calc_vector
 from tqdm import tqdm
 from ms2query.clean_and_filter_spectra import create_spectrum_documents
-from ms2query.create_new_library.add_classifire_classifications import (
-    convert_to_dataframe, select_compound_classes)
-from ms2query.create_new_library.create_sqlite_database import \
-    make_sqlfile_wrapper
+from ms2query.create_new_library.add_classifire_classifications import \
+    select_compound_classes
+from ms2query.create_new_library.create_sqlite_database import (
+    add_dataframe_to_sqlite, fill_inchikeys_table, fill_spectrum_data_table,
+    initialize_tables)
 
 
 class LibraryFilesCreator:
@@ -47,10 +48,10 @@ class LibraryFilesCreator:
     """
     def __init__(self,
                  library_spectra: List[Spectrum],
-                 output_directory: Union[str, Path],
+                 sqlite_file_name: Union[str, Path],
                  s2v_model_file_name: str = None,
                  ms2ds_model_file_name: str = None,
-                 add_compound_classes: bool = True
+                 compound_classes: Union[bool, pd.DataFrame, None] = True
                  ):
         """Creates files needed to run queries on a library
 
@@ -70,108 +71,133 @@ def __init__(self,
             File name of a ms2ds model
         """
         # pylint: disable=too-many-arguments
-        self.progress_bars = True
-        self.output_directory = output_directory
-        if not os.path.exists(self.output_directory):
-            os.mkdir(self.output_directory)
-        self.sqlite_file_name = os.path.join(output_directory, "ms2query_library.sqlite")
-        self.ms2ds_embeddings_file_name = os.path.join(output_directory, "ms2ds_embeddings.pickle")
-        self.s2v_embeddings_file_name = os.path.join(output_directory, "s2v_embeddings.pickle")
-        # These checks are performed at the start, since the filtering of spectra can take long
-        self._check_for_existing_files()
+        if os.path.exists(sqlite_file_name):
+            raise FileExistsError("The sqlite file already exists")
+        self.sqlite_file_name = sqlite_file_name
+
         # Load in spec2vec model
-        if s2v_model_file_name is None:
-            self.s2v_model = None
-        else:
-            assert os.path.exists(s2v_model_file_name), "Spec2Vec model file does not exists"
+        if os.path.exists(s2v_model_file_name):
             self.s2v_model = Word2Vec.load(s2v_model_file_name)
-        # load in ms2ds model
-        if ms2ds_model_file_name is None:
-            self.ms2ds_model = None
         else:
-            assert os.path.exists(ms2ds_model_file_name), "MS2Deepscore model file does not exists"
+            raise FileNotFoundError("Spec2Vec model file does not exists")
+        # load in ms2ds model
+        if os.path.exists(ms2ds_model_file_name):
             self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)
+        else:
+            raise FileNotFoundError("MS2Deepscore model file does not exists")
         # Initialise spectra
         self.list_of_spectra = library_spectra
 
         # Run default filters
         self.list_of_spectra = [msfilters.default_filters(s) for s in tqdm(self.list_of_spectra,
                                                                            desc="Applying default filters to spectra")]
-        self.add_compound_classes = add_compound_classes
-
-    def _check_for_existing_files(self):
-        assert not os.path.exists(self.sqlite_file_name), \
-            f"The file {self.sqlite_file_name} already exists," \
-            f" choose a different output_base_filename"
-        assert not os.path.exists(self.ms2ds_embeddings_file_name), \
-            f"The file {self.ms2ds_embeddings_file_name} " \
-            f"already exists, choose a different output_base_filename"
-        assert not os.path.exists(self.s2v_embeddings_file_name), \
-            f"The file {self.s2v_embeddings_file_name} " \
-            f"already exists, choose a different output_base_filename"
-
-    def create_all_library_files(self):
-        """Creates files with embeddings and a sqlite file with spectra data
-        """
-        self.create_sqlite_file()
-        self.store_s2v_embeddings()
-        self.store_ms2ds_embeddings()
+        self.compound_classes = self.add_compound_classes(compound_classes)
+        if self.compound_classes is not None:
+            self.additional_inchikey_columns = list(compound_classes.columns)
+        else:
+            self.additional_inchikey_columns = []
 
-    def create_sqlite_file(self):
-        if self.add_compound_classes:
+        self.progress_bars = True
+        self.additional_metadata_columns = {"precursor_mz": "REAL"}
+
+    def add_compound_classes(self,
+                             compound_classes: Union[pd.DataFrame, bool, None]):
+        """Calculates compound classes if True, validates a given dataframe, returns None for False or None
+        """
+        if compound_classes is True:
             compound_classes = select_compound_classes(self.list_of_spectra)
-            compound_classes_df = convert_to_dataframe(compound_classes)
+        elif compound_classes is not None and isinstance(compound_classes, pd.DataFrame):
+            if not compound_classes.index.name == "inchikey":
+                raise ValueError("Expected a pandas dataframe with inchikey as index name")
+        elif compound_classes is False or compound_classes is None:
+            compound_classes = None
         else:
-            compound_classes_df = None
-        make_sqlfile_wrapper(
-            self.sqlite_file_name,
-            self.list_of_spectra,
-            columns_dict={"precursor_mz": "REAL"},
-            compound_classes=compound_classes_df,
-            progress_bars=self.progress_bars,
-        )
-
-    def store_ms2ds_embeddings(self):
-        """Creates a pickled file with embeddings scores for spectra
-
-        A dataframe with as index randomly generated spectrum indexes and as columns the indexes
-        of the vector is converted to pickle.
-        """
-        assert not os.path.exists(self.ms2ds_embeddings_file_name), \
-            "Given ms2ds_embeddings_file_name already exists"
-        assert self.ms2ds_model is not None, "No MS2deepscore model was provided"
-        ms2ds = MS2DeepScore(self.ms2ds_model,
-                             progress_bar=self.progress_bars)
-
-        # Compute spectral embeddings
-        embeddings = ms2ds.calculate_vectors(self.list_of_spectra)
-        spectrum_ids = np.arange(0, len(self.list_of_spectra))
-        all_embeddings_df = pd.DataFrame(embeddings, index=spectrum_ids)
-        all_embeddings_df.to_pickle(self.ms2ds_embeddings_file_name)
-
-    def store_s2v_embeddings(self):
-        """Creates and stored a dataframe with embeddings as pickled file
-
-        A dataframe with as index randomly generated spectrum indexes and as columns the indexes
-        of the vector is converted to pickle.
+            raise ValueError("Expected a dataframe or True or None for compound classes")
+        return compound_classes
+
+    def create_sqlite_file(self):
+        """Wrapper to create sqlite file containing spectrum information needed for MS2Query
+
+        Args:
+        -------
+        sqlite_file_name:
+            Name of the sqlite file that should be created (self.sqlite_file_name).
+            If this file already exists a FileExistsError is raised, so existing
+            tables are never overwritten.
+        list_of_spectra:
+            A list with spectrum objects
+        columns_dict:
+            Dictionary with as keys columns that need to be added in addition to
+            the default columns and as values the datatype. The defaults columns
+            are spectrum_id, peaks, intensities and metadata. The additional
+            columns should be the same names that are in the metadata dictionary,
+            since these values will be automatically added in the function
+            add_list_of_spectra_to_sqlite.
+            Default = None results in the default columns.
+        progress_bars:
+            If progress_bars is True progress bars will be shown for the different
+            parts of the progress.
         """
-        assert not os.path.exists(self.s2v_embeddings_file_name), \
-            "Given s2v_embeddings_file_name already exists"
-        assert self.s2v_model is not None, "No spec2vec model was specified"
-        # Convert Spectrum objects to SpectrumDocument
-        spectrum_documents = create_spectrum_documents(
-            self.list_of_spectra,
-            progress_bar=self.progress_bars)
-        embeddings_dict = {}
-        for spectrum_id, spectrum_document in tqdm(enumerate(spectrum_documents),
-                                                   desc="Calculating embeddings",
-                                                   disable=not self.progress_bars):
-            embedding = calc_vector(self.s2v_model,
-                                    spectrum_document,
-                                    allowed_missing_percentage=100)
-            embeddings_dict[spectrum_id] = embedding
-
-        # Convert to pandas Dataframe
-        embeddings_dataframe = pd.DataFrame.from_dict(embeddings_dict,
-                                                      orient="index")
-        embeddings_dataframe.to_pickle(self.s2v_embeddings_file_name)
+        if os.path.exists(self.sqlite_file_name):
+            raise FileExistsError("The sqlite file already exists")
+        initialize_tables(self.sqlite_file_name,
+                          additional_metadata_columns_dict=self.additional_metadata_columns,
+                          additional_inchikey_columns=self.additional_inchikey_columns)
+        fill_spectrum_data_table(self.sqlite_file_name, self.list_of_spectra, progress_bar=self.progress_bars)
+
+        fill_inchikeys_table(self.sqlite_file_name, self.list_of_spectra,
+                             compound_classes=self.compound_classes,
+                             progress_bars=self.progress_bars)
+
+        add_dataframe_to_sqlite(self.sqlite_file_name,
+                                'MS2Deepscore_embeddings',
+                                create_ms2ds_embeddings(self.ms2ds_model, self.list_of_spectra, self.progress_bars), )
+        add_dataframe_to_sqlite(self.sqlite_file_name,
+                                'Spec2Vec_embeddings',
+                                create_s2v_embeddings(self.s2v_model, self.list_of_spectra, self.progress_bars))
+
+
+def create_ms2ds_embeddings(ms2ds_model,
+                            list_of_spectra,
+                            progress_bar=True):
+    """Creates the ms2deepscore embeddings for all spectra
+
+    Returns a dataframe with the spectrum ids (0 up to len(list_of_spectra) - 1) as index and
+    the positions in the embedding vector as columns.
+    """
+    assert ms2ds_model is not None, "No MS2deepscore model was provided"
+    ms2ds = MS2DeepScore(ms2ds_model,
+                         progress_bar=progress_bar)
+    # Compute spectral embeddings
+    embeddings = ms2ds.calculate_vectors(list_of_spectra)
+    spectrum_ids = np.arange(0, len(list_of_spectra))
+    all_embeddings_df = pd.DataFrame(embeddings, index=spectrum_ids)
+    return all_embeddings_df
+
+
+def create_s2v_embeddings(s2v_model,
+                          list_of_spectra,
+                          progress_bar=True):
+    """Creates a dataframe with the spec2vec embeddings for all spectra
+
+    Returns a dataframe with the enumeration index of each spectrum document as index and
+    the positions in the embedding vector as columns.
+    """
+    assert s2v_model is not None, "No spec2vec model was specified"
+    # Convert Spectrum objects to SpectrumDocument
+    spectrum_documents = create_spectrum_documents(
+        list_of_spectra,
+        progress_bar=progress_bar)
+    embeddings_dict = {}
+    for spectrum_id, spectrum_document in tqdm(enumerate(spectrum_documents),
+                                               desc="Calculating embeddings",
+                                               disable=not progress_bar):
+        embedding = calc_vector(s2v_model,
+                                spectrum_document,
+                                allowed_missing_percentage=100)
+        embeddings_dict[spectrum_id] = embedding
+
+    # Convert to pandas Dataframe
+    embeddings_dataframe = pd.DataFrame.from_dict(embeddings_dict,
+                                                  orient="index")
+    return embeddings_dataframe
diff --git a/ms2query/create_new_library/train_models.py b/ms2query/create_new_library/train_models.py
@@ -47,6 +47,7 @@ def train_all_models(annotated_training_spectra,
     spec2vec_model_file_name = os.path.join(output_folder, "spec2vec_model.model")
     ms2query_model_file_name = os.path.join(output_folder, "ms2query_model.onnx")
     ms2ds_history_figure_file_name = os.path.join(output_folder, "ms2deepscore_training_history.svg")
+    sqlite_model_file = os.path.join(output_folder, "ms2query_model.sqlite")
 
     # Train MS2Deepscore model
     train_ms2deepscore_wrapper(annotated_training_spectra,
@@ -75,11 +76,11 @@ def train_all_models(annotated_training_spectra,
 
     # Create library with all training spectra
     library_files_creator = LibraryFilesCreator(annotated_training_spectra,
-                                                output_folder,
+                                                sqlite_model_file,
                                                 spec2vec_model_file_name,
                                                 ms2deepscore_model_file_name,
-                                                add_compound_classes=settings.add_compound_classes)
-    library_files_creator.create_all_library_files()
+                                                compound_classes=settings.add_compound_classes)
+    library_files_creator.create_sqlite_file()
 
 
 def clean_and_train_models(spectrum_file: str,

diff --git a/ms2query/create_new_library/train_ms2query_model.py b/ms2query/create_new_library/train_ms2query_model.py
@@ -116,6 +116,8 @@ def train_ms2query_model(training_spectra,
                          ms2ds_model_file_name,
                          s2v_model_file_name,
                          fraction_for_training):
+    os.makedirs(library_files_folder, exist_ok=True)
+
     # Select spectra belonging to a single InChIKey
     library_spectra, unique_inchikey_query_spectra = split_spectra_on_inchikeys(training_spectra,
                                                                                 fraction_for_training)
@@ -125,17 +127,17 @@ def train_ms2query_model(training_spectra,
     query_spectra_for_training = unique_inchikey_query_spectra + single_spectra_query_spectra
 
     # Create library files for training ms2query
-    library_creator_for_training = LibraryFilesCreator(library_spectra, output_directory=library_files_folder,
-                                                       s2v_model_file_name=s2v_model_file_name,
-                                                       ms2ds_model_file_name=ms2ds_model_file_name,
-                                                       add_compound_classes=False)
-    library_creator_for_training.create_all_library_files()
+    library_creator_for_training = LibraryFilesCreator(
+        library_spectra,
+        sqlite_file_name=os.path.join(library_files_folder, "ms2query_library.sqlite"),
+        s2v_model_file_name=s2v_model_file_name,
+        ms2ds_model_file_name=ms2ds_model_file_name,
+        compound_classes=None)
+    library_creator_for_training.create_sqlite_file()
 
     ms2library_for_training = MS2Library(sqlite_file_name=library_creator_for_training.sqlite_file_name,
                                          s2v_model_file_name=s2v_model_file_name,
                                          ms2ds_model_file_name=ms2ds_model_file_name,
-                                         pickled_s2v_embeddings_file_name=library_creator_for_training.s2v_embeddings_file_name,
-                                         pickled_ms2ds_embeddings_file_name=library_creator_for_training.ms2ds_embeddings_file_name,
                                          ms2query_model_file_name=None)
     # Create training data MS2Query model
     collector = DataCollectorForTraining(ms2library_for_training)