Add embeddings to sqlite #228

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

niekdejonge wants to merge 19 commits into main from add_embeddings_to_sqlite

README.md

-Original file line number
+Diff line change
@@ Expand Up @@
                                           output_directory=directory_for_library_and_models,
                                           ms2ds_model_file_name=ms2ds_model_file_name,
                                           s2v_model_file_name=s2v_model_file_name, )
-    library_creator.create_all_library_files()
+    library_creator.create_sqlite_file()
     ```
     To run MS2Query on your own created library, check out the instructions under Run MS2Query. Both the command line and the code version should work.
@@ Expand Down @@

ms2query/create_new_library/add_classifire_classifications.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -127,10 +127,12 @@ def select_compound_classes(spectra): @@
             if npc_results is None:
                 print(f"no npc annotation was found for inchikey {inchikey14}")
                 inchikey_results_list[i] += ["", "", "", ""]
-        return inchikey_results_list
+        compound_classes_df = _convert_to_dataframe(inchikey_results_list)
+        assert compound_classes_df.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
+        return compound_classes_df
-    def convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
+    def _convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
         header_list = [
             'inchikey', 'cf_kingdom',
             'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent',
@@ Expand Down @@

ms2query/create_new_library/create_sqlite_database.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -10,49 +10,15 @@ @@
     from tqdm import tqdm
     from ms2query.create_new_library.calculate_tanimoto_scores import \
         calculate_highest_tanimoto_score
-    from ms2query.utils import return_non_existing_file_name
-    def make_sqlfile_wrapper(sqlite_file_name: str,
-                             list_of_spectra: List[Spectrum],
-                             columns_dict: Dict[str, str] = None,
-                             compound_classes: pd.DataFrame = None,
-                             progress_bars: bool = True):
-        """Wrapper to create sqlite file containing spectrum information needed for MS2Query
-        Args:
-        -------
-        sqlite_file_name:
-            Name of sqlite_file that should be created, if it already exists the
-            tables are added. If the tables in this sqlite file already exist, they
-            will be overwritten.
-        list_of_spectra:
-            A list with spectrum objects
-        columns_dict:
-            Dictionary with as keys columns that need to be added in addition to
-            the default columns and as values the datatype. The defaults columns
-            are spectrum_id, peaks, intensities and metadata. The additional
-            columns should be the same names that are in the metadata dictionary,
-            since these values will be automatically added in the function
-            add_list_of_spectra_to_sqlite.
-            Default = None results in the default columns.
-        progress_bars:
-            If progress_bars is True progress bars will be shown for the different
-            parts of the progress.
-        """
-        sqlite_file_name = return_non_existing_file_name(sqlite_file_name)
-        additional_inchikey_columns = []
-        if compound_classes is not None:
-            additional_inchikey_columns = list(compound_classes.columns)
-            assert compound_classes.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
-        initialize_tables(sqlite_file_name, additional_metadata_columns_dict=columns_dict,
-                          additional_inchikey_columns=additional_inchikey_columns)
-        fill_spectrum_data_table(sqlite_file_name, list_of_spectra, progress_bar=progress_bars)
-        fill_inchikeys_table(sqlite_file_name, list_of_spectra,
-                             compound_classes=compound_classes,
-                             progress_bars=progress_bars)
+    def add_dataframe_to_sqlite(sqlite_file_name,
+                                table_name,
+                                dataframe: pd.DataFrame):
+        conn = sqlite3.connect(sqlite_file_name)
+        dataframe.to_sql(table_name, conn, if_exists='fail', index=True, index_label="spectrumid")
+        conn.commit()
+        conn.close()
     def initialize_tables(sqlite_file_name: str,
@@ Expand Down @@

ms2query/create_new_library/library_files_creator.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -16,10 +16,11 @@
  
    from spec2vec.vector_operations import calc_vector

    from tqdm import tqdm

    from ms2query.clean_and_filter_spectra import create_spectrum_documents

    from ms2query.create_new_library.add_classifire_classifications import (

        convert_to_dataframe, select_compound_classes)

    from ms2query.create_new_library.create_sqlite_database import \

        make_sqlfile_wrapper

    from ms2query.create_new_library.add_classifire_classifications import \

        select_compound_classes

    from ms2query.create_new_library.create_sqlite_database import (

        add_dataframe_to_sqlite, fill_inchikeys_table, fill_spectrum_data_table,

        initialize_tables)

    class LibraryFilesCreator:

    @@ -47,10 +48,10 @@ class LibraryFilesCreator:
  
        """

        def __init__(self,

                     library_spectra: List[Spectrum],

                     output_directory: Union[str, Path],

                     sqlite_file_name: Union[str, Path],

                     s2v_model_file_name: str = None,

                     ms2ds_model_file_name: str = None,

                     add_compound_classes: bool = True

                     compound_classes: Union[bool, pd.DataFrame, None] = True

                     ):

            """Creates files needed to run queries on a library

    @@ -70,108 +71,133 @@ def __init__(self,
  
                File name of a ms2ds model

            """

            # pylint: disable=too-many-arguments

            self.progress_bars = True

            self.output_directory = output_directory

            if not os.path.exists(self.output_directory):

                os.mkdir(self.output_directory)

            self.sqlite_file_name = os.path.join(output_directory, "ms2query_library.sqlite")

            self.ms2ds_embeddings_file_name = os.path.join(output_directory, "ms2ds_embeddings.pickle")

            self.s2v_embeddings_file_name = os.path.join(output_directory, "s2v_embeddings.pickle")

            # These checks are performed at the start, since the filtering of spectra can take long

            self._check_for_existing_files()

            if os.path.exists(sqlite_file_name):

                raise FileExistsError("The sqlite file already exists")

            self.sqlite_file_name = sqlite_file_name

            # Load in spec2vec model

            if s2v_model_file_name is None:

                self.s2v_model = None

            else:

                assert os.path.exists(s2v_model_file_name), "Spec2Vec model file does not exists"

            if os.path.exists(s2v_model_file_name):

                self.s2v_model = Word2Vec.load(s2v_model_file_name)

            # load in ms2ds model

            if ms2ds_model_file_name is None:

                self.ms2ds_model = None

            else:

                assert os.path.exists(ms2ds_model_file_name), "MS2Deepscore model file does not exists"

                raise FileNotFoundError("Spec2Vec model file does not exists")

            # load in ms2ds model

            if os.path.exists(ms2ds_model_file_name):

                self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)

            else:

                raise FileNotFoundError("MS2Deepscore model file does not exists")

            # Initialise spectra

            self.list_of_spectra = library_spectra

            # Run default filters

            self.list_of_spectra = [msfilters.default_filters(s) for s in tqdm(self.list_of_spectra,

                                                                               desc="Applying default filters to spectra")]

            self.add_compound_classes = add_compound_classes

        def _check_for_existing_files(self):

            assert not os.path.exists(self.sqlite_file_name), \

                f"The file {self.sqlite_file_name} already exists," \

                f" choose a different output_base_filename"

            assert not os.path.exists(self.ms2ds_embeddings_file_name), \

                f"The file {self.ms2ds_embeddings_file_name} " \

                f"already exists, choose a different output_base_filename"

            assert not os.path.exists(self.s2v_embeddings_file_name), \

                f"The file {self.s2v_embeddings_file_name} " \

                f"already exists, choose a different output_base_filename"

        def create_all_library_files(self):

            """Creates files with embeddings and a sqlite file with spectra data

            """

            self.create_sqlite_file()

            self.store_s2v_embeddings()

            self.store_ms2ds_embeddings()

            self.compound_classes = self.add_compound_classes(compound_classes)

            if self.compound_classes is not None:

                self.additional_inchikey_columns = list(compound_classes.columns)

            else:

                self.additional_inchikey_columns = []

        def create_sqlite_file(self):

            if self.add_compound_classes:

            self.progress_bars = True

            self.additional_metadata_columns = {"precursor_mz": "REAL"}

        def add_compound_classes(self,

                                 compound_classes: Union[pd.DataFrame, bool, None]):

            """Calculates compound classes if True, otherwise uses given compound_classes

            """

            if compound_classes is True:

                compound_classes = select_compound_classes(self.list_of_spectra)

                compound_classes_df = convert_to_dataframe(compound_classes)

            elif compound_classes is not None and isinstance(compound_classes, pd.DataFrame):

                if not compound_classes.index.name == "inchikey":

                    raise ValueError("Expected a pandas dataframe with inchikey as index name")

            elif compound_classes is False or compound_classes is None:

                compound_classes = None

            else:

                compound_classes_df = None

            make_sqlfile_wrapper(

                self.sqlite_file_name,

                self.list_of_spectra,

                columns_dict={"precursor_mz": "REAL"},

                compound_classes=compound_classes_df,

                progress_bars=self.progress_bars,

            )

        def store_ms2ds_embeddings(self):

            """Creates a pickled file with embeddings scores for spectra

            A dataframe with as index randomly generated spectrum indexes and as columns the indexes

            of the vector is converted to pickle.

            """

            assert not os.path.exists(self.ms2ds_embeddings_file_name), \

                "Given ms2ds_embeddings_file_name already exists"

            assert self.ms2ds_model is not None, "No MS2deepscore model was provided"

            ms2ds = MS2DeepScore(self.ms2ds_model,

                                 progress_bar=self.progress_bars)

            # Compute spectral embeddings

            embeddings = ms2ds.calculate_vectors(self.list_of_spectra)

            spectrum_ids = np.arange(0, len(self.list_of_spectra))

            all_embeddings_df = pd.DataFrame(embeddings, index=spectrum_ids)

            all_embeddings_df.to_pickle(self.ms2ds_embeddings_file_name)

        def store_s2v_embeddings(self):

            """Creates and stored a dataframe with embeddings as pickled file

            A dataframe with as index randomly generated spectrum indexes and as columns the indexes

            of the vector is converted to pickle.

                raise ValueError("Expected a dataframe or True or None for compound classes")

            return compound_classes

        def create_sqlite_file(self):

            """Wrapper to create sqlite file containing spectrum information needed for MS2Query

            Args:

            -------

            sqlite_file_name:

                Name of sqlite_file that should be created, if it already exists the

                tables are added. If the tables in this sqlite file already exist, they

                will be overwritten.

            list_of_spectra:

                A list with spectrum objects

            columns_dict:

                Dictionary with as keys columns that need to be added in addition to

                the default columns and as values the datatype. The defaults columns

                are spectrum_id, peaks, intensities and metadata. The additional

                columns should be the same names that are in the metadata dictionary,

                since these values will be automatically added in the function

                add_list_of_spectra_to_sqlite.

                Default = None results in the default columns.

            progress_bars:

                If progress_bars is True progress bars will be shown for the different

                parts of the progress.

            """

            assert not os.path.exists(self.s2v_embeddings_file_name), \

                "Given s2v_embeddings_file_name already exists"

            assert self.s2v_model is not None, "No spec2vec model was specified"

            # Convert Spectrum objects to SpectrumDocument

            spectrum_documents = create_spectrum_documents(

                self.list_of_spectra,

                progress_bar=self.progress_bars)

            embeddings_dict = {}

            for spectrum_id, spectrum_document in tqdm(enumerate(spectrum_documents),

                                                       desc="Calculating embeddings",

                                                       disable=not self.progress_bars):

                embedding = calc_vector(self.s2v_model,

                                        spectrum_document,

                                        allowed_missing_percentage=100)

                embeddings_dict[spectrum_id] = embedding

            # Convert to pandas Dataframe

            embeddings_dataframe = pd.DataFrame.from_dict(embeddings_dict,

                                                          orient="index")

            embeddings_dataframe.to_pickle(self.s2v_embeddings_file_name)

            if os.path.exists(self.sqlite_file_name):

                raise FileExistsError("The sqlite file already exists")

            initialize_tables(self.sqlite_file_name,

                              additional_metadata_columns_dict=self.additional_metadata_columns,

                              additional_inchikey_columns=self.additional_inchikey_columns)

            fill_spectrum_data_table(self.sqlite_file_name, self.list_of_spectra, progress_bar=self.progress_bars)

            fill_inchikeys_table(self.sqlite_file_name, self.list_of_spectra,

                                 compound_classes=self.compound_classes,

                                 progress_bars=self.progress_bars)

            add_dataframe_to_sqlite(self.sqlite_file_name,

                                    'MS2Deepscore_embeddings',

                                    create_ms2ds_embeddings(self.ms2ds_model, self.list_of_spectra, self.progress_bars), )

            add_dataframe_to_sqlite(self.sqlite_file_name,

                                    'Spec2Vec_embeddings',

                                    create_s2v_embeddings(self.s2v_model, self.list_of_spectra, self.progress_bars))

    def create_ms2ds_embeddings(ms2ds_model,
                                list_of_spectra,
                                progress_bar=True):
        """Calculates MS2Deepscore embeddings for all spectra and returns them as a dataframe

        Args:
        -------
        ms2ds_model:
            The MS2Deepscore model used to calculate the embeddings. Must not be None.
        list_of_spectra:
            The spectra for which an embedding is calculated.
        progress_bar:
            Passed on to MS2DeepScore as its progress_bar setting.

        Returns:
        -------
        A pandas DataFrame built from the vectors returned by
        MS2DeepScore.calculate_vectors. The index runs from 0 up to
        len(list_of_spectra) - 1, so each row label is the position of the
        spectrum in list_of_spectra. Nothing is written to disk here.
        """
        assert ms2ds_model is not None, "No MS2deepscore model was provided"
        similarity_measure = MS2DeepScore(ms2ds_model, progress_bar=progress_bar)
        vectors = similarity_measure.calculate_vectors(list_of_spectra)
        # Row labels are the positions of the spectra in the input list
        positions = np.arange(len(list_of_spectra))
        return pd.DataFrame(vectors, index=positions)

    def create_s2v_embeddings(s2v_model,
                              list_of_spectra,
                              progress_bar=True):
        """Calculates Spec2Vec embeddings for all spectra and returns them as a dataframe

        Args:
        -------
        s2v_model:
            The Spec2Vec model used to calculate the embeddings. Must not be None.
        list_of_spectra:
            The spectra that are converted to spectrum documents, after which an
            embedding is calculated for each document.
        progress_bar:
            If True progress bars are shown, both while creating the spectrum
            documents and while calculating the embeddings.

        Returns:
        -------
        A pandas DataFrame with one row per spectrum document. The index is the
        position of the document in the output of create_spectrum_documents
        (counted from 0) and the row holds the vector returned by calc_vector.
        Nothing is written to disk here.
        """
        assert s2v_model is not None, "No spec2vec model was specified"
        documents = create_spectrum_documents(list_of_spectra,
                                              progress_bar=progress_bar)
        # NOTE(review): allowed_missing_percentage=100 presumably prevents calc_vector
        # from rejecting documents with unknown words - confirm against spec2vec.
        vector_per_position = {
            position: calc_vector(s2v_model,
                                  document,
                                  allowed_missing_percentage=100)
            for position, document in tqdm(enumerate(documents),
                                           desc="Calculating embeddings",
                                           disable=not progress_bar)}
        return pd.DataFrame.from_dict(vector_per_position, orient="index")

ms2query/create_new_library/train_models.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -47,6 +47,7 @@ def train_all_models(annotated_training_spectra,
  
        spec2vec_model_file_name = os.path.join(output_folder, "spec2vec_model.model")

        ms2query_model_file_name = os.path.join(output_folder, "ms2query_model.onnx")

        ms2ds_history_figure_file_name = os.path.join(output_folder, "ms2deepscore_training_history.svg")

        sqlite_model_file = os.path.join(output_folder, "ms2query_model.sqlite")

        # Train MS2Deepscore model

        train_ms2deepscore_wrapper(annotated_training_spectra,

    @@ -75,11 +76,11 @@ def train_all_models(annotated_training_spectra,
  
        # Create library with all training spectra

        library_files_creator = LibraryFilesCreator(annotated_training_spectra,

                                                    output_folder,

                                                    sqlite_model_file,

                                                    spec2vec_model_file_name,

                                                    ms2deepscore_model_file_name,

                                                    add_compound_classes=settings.add_compound_classes)

        library_files_creator.create_all_library_files()

                                                    compound_classes=settings.add_compound_classes)

        library_files_creator.create_sqlite_file()

    def clean_and_train_models(spectrum_file: str,

ms2query/create_new_library/train_ms2query_model.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -116,6 +116,8 @@ def train_ms2query_model(training_spectra, @@
                              ms2ds_model_file_name,
                              s2v_model_file_name,
                              fraction_for_training):
+        os.makedirs(library_files_folder, exist_ok=True)
         # Select spectra belonging to a single InChIKey
         library_spectra, unique_inchikey_query_spectra = split_spectra_on_inchikeys(training_spectra,
                                                                                     fraction_for_training)
@@ Expand All / @@ -125,17 +127,17 @@ def train_ms2query_model(training_spectra, @@
         query_spectra_for_training = unique_inchikey_query_spectra + single_spectra_query_spectra
         # Create library files for training ms2query
-        library_creator_for_training = LibraryFilesCreator(library_spectra, output_directory=library_files_folder,
-                                                           s2v_model_file_name=s2v_model_file_name,
-                                                           ms2ds_model_file_name=ms2ds_model_file_name,
-                                                           add_compound_classes=False)
-        library_creator_for_training.create_all_library_files()
+        library_creator_for_training = LibraryFilesCreator(
+            library_spectra,
+            sqlite_file_name=os.path.join(library_files_folder, "ms2query_library.sqlite"),
+            s2v_model_file_name=s2v_model_file_name,
+            ms2ds_model_file_name=ms2ds_model_file_name,
+            compound_classes=None)
+        library_creator_for_training.create_sqlite_file()
         ms2library_for_training = MS2Library(sqlite_file_name=library_creator_for_training.sqlite_file_name,
                                              s2v_model_file_name=s2v_model_file_name,
                                              ms2ds_model_file_name=ms2ds_model_file_name,
-                                             pickled_s2v_embeddings_file_name=library_creator_for_training.s2v_embeddings_file_name,
-                                             pickled_ms2ds_embeddings_file_name=library_creator_for_training.ms2ds_embeddings_file_name,
                                              ms2query_model_file_name=None)
         # Create training data MS2Query model
         collector = DataCollectorForTraining(ms2library_for_training)
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add embeddings to sqlite #228

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Add embeddings to sqlite #228

Are you sure you want to change the base?

Uh oh!

Add embeddings to sqlite #228

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!