Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ library_creator = LibraryFilesCreator(cleaned_library_spectra,
output_directory=directory_for_library_and_models,
ms2ds_model_file_name=ms2ds_model_file_name,
s2v_model_file_name=s2v_model_file_name, )
library_creator.create_all_library_files()
library_creator.create_sqlite_file()
```

To run MS2Query on your own created library, follow the instructions under Run MS2Query. Both the command line version and the code version should work.
Expand Down
6 changes: 4 additions & 2 deletions ms2query/create_new_library/add_classifire_classifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,12 @@ def select_compound_classes(spectra):
if npc_results is None:
print(f"no npc annotation was found for inchikey {inchikey14}")
inchikey_results_list[i] += ["", "", "", ""]
return inchikey_results_list
compound_classes_df = _convert_to_dataframe(inchikey_results_list)
assert compound_classes_df.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"
return compound_classes_df


def convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
def _convert_to_dataframe(inchikey_results_lists)->pd.DataFrame:
header_list = [
'inchikey', 'cf_kingdom',
'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent',
Expand Down
48 changes: 7 additions & 41 deletions ms2query/create_new_library/create_sqlite_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,15 @@
from tqdm import tqdm
from ms2query.create_new_library.calculate_tanimoto_scores import \
calculate_highest_tanimoto_score
from ms2query.utils import return_non_existing_file_name


def make_sqlfile_wrapper(sqlite_file_name: str,
                         list_of_spectra: List[Spectrum],
                         columns_dict: Dict[str, str] = None,
                         compound_classes: pd.DataFrame = None,
                         progress_bars: bool = True):
    """Create and fill the sqlite file with the spectrum information MS2Query needs

    The tables are first initialized, after which the spectrum data table and
    the inchikeys table are filled, in that order.

    Args:
    -------
    sqlite_file_name:
        Requested name of the sqlite file. The name is first passed through
        return_non_existing_file_name and the result is used for all tables.
        NOTE(review): presumably this yields a file name that does not exist
        yet - confirm against ms2query.utils.
    list_of_spectra:
        A list with spectrum objects
    columns_dict:
        Dictionary with as keys the metadata columns that should be added in
        addition to the default columns and as values the datatype. It is
        forwarded to initialize_tables as additional_metadata_columns_dict.
        Default = None.
    compound_classes:
        Optional dataframe with compound class annotations. Its index name has
        to be "inchikey". Its column names are forwarded to initialize_tables
        as additional inchikey columns and the dataframe itself is forwarded
        to fill_inchikeys_table. Default = None adds no extra inchikey columns.
    progress_bars:
        Forwarded to fill_spectrum_data_table and fill_inchikeys_table to
        switch progress bars on or off.
    """
    target_file_name = return_non_existing_file_name(sqlite_file_name)

    if compound_classes is None:
        inchikey_columns = []
    else:
        inchikey_columns = list(compound_classes.columns)
        assert compound_classes.index.name == "inchikey", "Expected a pandas dataframe with inchikey as index name"

    initialize_tables(target_file_name,
                      additional_metadata_columns_dict=columns_dict,
                      additional_inchikey_columns=inchikey_columns)
    fill_spectrum_data_table(target_file_name,
                             list_of_spectra,
                             progress_bar=progress_bars)
    fill_inchikeys_table(target_file_name,
                         list_of_spectra,
                         compound_classes=compound_classes,
                         progress_bars=progress_bars)
def add_dataframe_to_sqlite(sqlite_file_name,
                            table_name,
                            dataframe: pd.DataFrame):
    """Stores a dataframe as a new table in a sqlite file

    The index of the dataframe is stored as well, in a column named
    "spectrumid".

    Args:
    -------
    sqlite_file_name:
        File name of the sqlite file the table is added to.
    table_name:
        Name of the table that is created. The table should not exist yet,
        since the dataframe is written with if_exists='fail'.
    dataframe:
        The dataframe that is stored.

    Raises:
    -------
    ValueError:
        Raised by pandas when a table named table_name already exists.
    """
    conn = sqlite3.connect(sqlite_file_name)
    try:
        dataframe.to_sql(table_name, conn, if_exists='fail', index=True, index_label="spectrumid")
        conn.commit()
    finally:
        # Always release the connection, also when writing fails (e.g. because
        # the table already exists), so the file handle is not leaked.
        conn.close()


def initialize_tables(sqlite_file_name: str,
Expand Down
220 changes: 123 additions & 97 deletions ms2query/create_new_library/library_files_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
from spec2vec.vector_operations import calc_vector
from tqdm import tqdm
from ms2query.clean_and_filter_spectra import create_spectrum_documents
from ms2query.create_new_library.add_classifire_classifications import (
convert_to_dataframe, select_compound_classes)
from ms2query.create_new_library.create_sqlite_database import \
make_sqlfile_wrapper
from ms2query.create_new_library.add_classifire_classifications import \
select_compound_classes
from ms2query.create_new_library.create_sqlite_database import (
add_dataframe_to_sqlite, fill_inchikeys_table, fill_spectrum_data_table,
initialize_tables)


class LibraryFilesCreator:
Expand Down Expand Up @@ -47,10 +48,10 @@ class LibraryFilesCreator:
"""
def __init__(self,
library_spectra: List[Spectrum],
output_directory: Union[str, Path],
sqlite_file_name: Union[str, Path],
s2v_model_file_name: str = None,
ms2ds_model_file_name: str = None,
add_compound_classes: bool = True
compound_classes: Union[bool, pd.DataFrame, None] = True
):
"""Creates files needed to run queries on a library

Expand All @@ -70,108 +71,133 @@ def __init__(self,
File name of a ms2ds model
"""
# pylint: disable=too-many-arguments
self.progress_bars = True
self.output_directory = output_directory
if not os.path.exists(self.output_directory):
os.mkdir(self.output_directory)
self.sqlite_file_name = os.path.join(output_directory, "ms2query_library.sqlite")
self.ms2ds_embeddings_file_name = os.path.join(output_directory, "ms2ds_embeddings.pickle")
self.s2v_embeddings_file_name = os.path.join(output_directory, "s2v_embeddings.pickle")
# These checks are performed at the start, since the filtering of spectra can take long
self._check_for_existing_files()
if os.path.exists(sqlite_file_name):
raise FileExistsError("The sqlite file already exists")
self.sqlite_file_name = sqlite_file_name

# Load in spec2vec model
if s2v_model_file_name is None:
self.s2v_model = None
else:
assert os.path.exists(s2v_model_file_name), "Spec2Vec model file does not exists"
if os.path.exists(s2v_model_file_name):
self.s2v_model = Word2Vec.load(s2v_model_file_name)
# load in ms2ds model
if ms2ds_model_file_name is None:
self.ms2ds_model = None
else:
assert os.path.exists(ms2ds_model_file_name), "MS2Deepscore model file does not exists"
raise FileNotFoundError("Spec2Vec model file does not exists")
# load in ms2ds model
if os.path.exists(ms2ds_model_file_name):
self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)
else:
raise FileNotFoundError("MS2Deepscore model file does not exists")
# Initialise spectra
self.list_of_spectra = library_spectra

# Run default filters
self.list_of_spectra = [msfilters.default_filters(s) for s in tqdm(self.list_of_spectra,
desc="Applying default filters to spectra")]
self.add_compound_classes = add_compound_classes

def _check_for_existing_files(self):
assert not os.path.exists(self.sqlite_file_name), \
f"The file {self.sqlite_file_name} already exists," \
f" choose a different output_base_filename"
assert not os.path.exists(self.ms2ds_embeddings_file_name), \
f"The file {self.ms2ds_embeddings_file_name} " \
f"already exists, choose a different output_base_filename"
assert not os.path.exists(self.s2v_embeddings_file_name), \
f"The file {self.s2v_embeddings_file_name} " \
f"already exists, choose a different output_base_filename"

def create_all_library_files(self):
"""Creates files with embeddings and a sqlite file with spectra data
"""
self.create_sqlite_file()
self.store_s2v_embeddings()
self.store_ms2ds_embeddings()
self.compound_classes = self.add_compound_classes(compound_classes)
if self.compound_classes is not None:
self.additional_inchikey_columns = list(compound_classes.columns)
else:
self.additional_inchikey_columns = []

def create_sqlite_file(self):
if self.add_compound_classes:
self.progress_bars = True
self.additional_metadata_columns = {"precursor_mz": "REAL"}

def add_compound_classes(self,
compound_classes: Union[pd.DataFrame, bool, None]):
"""Calculates compound classes if True, otherwise uses given compound_classes
"""
if compound_classes is True:
compound_classes = select_compound_classes(self.list_of_spectra)
compound_classes_df = convert_to_dataframe(compound_classes)
elif compound_classes is not None and isinstance(compound_classes, pd.DataFrame):
if not compound_classes.index.name == "inchikey":
raise ValueError("Expected a pandas dataframe with inchikey as index name")
elif compound_classes is False or compound_classes is None:
compound_classes = None
else:
compound_classes_df = None
make_sqlfile_wrapper(
self.sqlite_file_name,
self.list_of_spectra,
columns_dict={"precursor_mz": "REAL"},
compound_classes=compound_classes_df,
progress_bars=self.progress_bars,
)

def store_ms2ds_embeddings(self):
"""Creates a pickled file with embeddings scores for spectra

A dataframe with as index randomly generated spectrum indexes and as columns the indexes
of the vector is converted to pickle.
"""
assert not os.path.exists(self.ms2ds_embeddings_file_name), \
"Given ms2ds_embeddings_file_name already exists"
assert self.ms2ds_model is not None, "No MS2deepscore model was provided"
ms2ds = MS2DeepScore(self.ms2ds_model,
progress_bar=self.progress_bars)

# Compute spectral embeddings
embeddings = ms2ds.calculate_vectors(self.list_of_spectra)
spectrum_ids = np.arange(0, len(self.list_of_spectra))
all_embeddings_df = pd.DataFrame(embeddings, index=spectrum_ids)
all_embeddings_df.to_pickle(self.ms2ds_embeddings_file_name)

def store_s2v_embeddings(self):
"""Creates and stored a dataframe with embeddings as pickled file

A dataframe with as index randomly generated spectrum indexes and as columns the indexes
of the vector is converted to pickle.
raise ValueError("Expected a dataframe or True or None for compound classes")
return compound_classes

def create_sqlite_file(self):
"""Wrapper to create sqlite file containing spectrum information needed for MS2Query

Args:
-------
sqlite_file_name:
Name of sqlite_file that should be created, if it already exists the
tables are added. If the tables in this sqlite file already exist, they
will be overwritten.
list_of_spectra:
A list with spectrum objects
columns_dict:
Dictionary with as keys columns that need to be added in addition to
the default columns and as values the datatype. The defaults columns
are spectrum_id, peaks, intensities and metadata. The additional
columns should be the same names that are in the metadata dictionary,
since these values will be automatically added in the function
add_list_of_spectra_to_sqlite.
Default = None results in the default columns.
progress_bars:
If progress_bars is True progress bars will be shown for the different
parts of the progress.
"""
assert not os.path.exists(self.s2v_embeddings_file_name), \
"Given s2v_embeddings_file_name already exists"
assert self.s2v_model is not None, "No spec2vec model was specified"
# Convert Spectrum objects to SpectrumDocument
spectrum_documents = create_spectrum_documents(
self.list_of_spectra,
progress_bar=self.progress_bars)
embeddings_dict = {}
for spectrum_id, spectrum_document in tqdm(enumerate(spectrum_documents),
desc="Calculating embeddings",
disable=not self.progress_bars):
embedding = calc_vector(self.s2v_model,
spectrum_document,
allowed_missing_percentage=100)
embeddings_dict[spectrum_id] = embedding

# Convert to pandas Dataframe
embeddings_dataframe = pd.DataFrame.from_dict(embeddings_dict,
orient="index")
embeddings_dataframe.to_pickle(self.s2v_embeddings_file_name)
if os.path.exists(self.sqlite_file_name):
raise FileExistsError("The sqlite file already exists")
initialize_tables(self.sqlite_file_name,
additional_metadata_columns_dict=self.additional_metadata_columns,
additional_inchikey_columns=self.additional_inchikey_columns)
fill_spectrum_data_table(self.sqlite_file_name, self.list_of_spectra, progress_bar=self.progress_bars)

fill_inchikeys_table(self.sqlite_file_name, self.list_of_spectra,
compound_classes=self.compound_classes,
progress_bars=self.progress_bars)

add_dataframe_to_sqlite(self.sqlite_file_name,
'MS2Deepscore_embeddings',
create_ms2ds_embeddings(self.ms2ds_model, self.list_of_spectra, self.progress_bars), )
add_dataframe_to_sqlite(self.sqlite_file_name,
'Spec2Vec_embeddings',
create_s2v_embeddings(self.s2v_model, self.list_of_spectra, self.progress_bars))


def create_ms2ds_embeddings(ms2ds_model,
                            list_of_spectra,
                            progress_bar=True):
    """Calculates the MS2Deepscore embeddings for all spectra

    Returns a dataframe with the output of MS2DeepScore.calculate_vectors.
    The index runs from 0 up to len(list_of_spectra) - 1, so it is the
    position of the spectrum in list_of_spectra. Nothing is written to disk.

    Args:
    -------
    ms2ds_model:
        A loaded MS2Deepscore model, None is not allowed.
    list_of_spectra:
        The spectra for which the embeddings are calculated.
    progress_bar:
        Forwarded to MS2DeepScore to switch its progress bar on or off.
    """
    assert ms2ds_model is not None, "No MS2deepscore model was provided"
    embedding_calculator = MS2DeepScore(ms2ds_model, progress_bar=progress_bar)
    embedding_vectors = embedding_calculator.calculate_vectors(list_of_spectra)
    # Label every row with the position of its spectrum in list_of_spectra
    row_labels = np.arange(0, len(list_of_spectra))
    return pd.DataFrame(embedding_vectors, index=row_labels)


def create_s2v_embeddings(s2v_model,
                          list_of_spectra,
                          progress_bar=True):
    """Calculates the Spec2Vec embeddings for all spectra

    The spectra are converted to spectrum documents, after which an embedding
    is calculated for each document with calc_vector. Returns a dataframe with
    one row per spectrum document; the index is the position of the document
    in the list returned by create_spectrum_documents. Nothing is written to
    disk.

    Args:
    -------
    s2v_model:
        A loaded Spec2Vec model, None is not allowed.
    list_of_spectra:
        The spectra for which the embeddings are calculated.
    progress_bar:
        If False the progress bars are disabled.
    """
    assert s2v_model is not None, "No spec2vec model was specified"
    # Spec2Vec works on SpectrumDocument objects instead of Spectrum objects
    documents = create_spectrum_documents(list_of_spectra,
                                          progress_bar=progress_bar)
    numbered_documents = tqdm(enumerate(documents),
                              desc="Calculating embeddings",
                              disable=not progress_bar)
    # allowed_missing_percentage=100 is passed so calc_vector gets the most
    # permissive setting for words missing from the model.
    embedding_per_position = {
        position: calc_vector(s2v_model,
                              document,
                              allowed_missing_percentage=100)
        for position, document in numbered_documents
    }
    return pd.DataFrame.from_dict(embedding_per_position, orient="index")
7 changes: 4 additions & 3 deletions ms2query/create_new_library/train_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def train_all_models(annotated_training_spectra,
spec2vec_model_file_name = os.path.join(output_folder, "spec2vec_model.model")
ms2query_model_file_name = os.path.join(output_folder, "ms2query_model.onnx")
ms2ds_history_figure_file_name = os.path.join(output_folder, "ms2deepscore_training_history.svg")
sqlite_model_file = os.path.join(output_folder, "ms2query_model.sqlite")

# Train MS2Deepscore model
train_ms2deepscore_wrapper(annotated_training_spectra,
Expand Down Expand Up @@ -75,11 +76,11 @@ def train_all_models(annotated_training_spectra,

# Create library with all training spectra
library_files_creator = LibraryFilesCreator(annotated_training_spectra,
output_folder,
sqlite_model_file,
spec2vec_model_file_name,
ms2deepscore_model_file_name,
add_compound_classes=settings.add_compound_classes)
library_files_creator.create_all_library_files()
compound_classes=settings.add_compound_classes)
library_files_creator.create_sqlite_file()


def clean_and_train_models(spectrum_file: str,
Expand Down
16 changes: 9 additions & 7 deletions ms2query/create_new_library/train_ms2query_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ def train_ms2query_model(training_spectra,
ms2ds_model_file_name,
s2v_model_file_name,
fraction_for_training):
os.makedirs(library_files_folder, exist_ok=True)

# Select spectra belonging to a single InChIKey
library_spectra, unique_inchikey_query_spectra = split_spectra_on_inchikeys(training_spectra,
fraction_for_training)
Expand All @@ -125,17 +127,17 @@ def train_ms2query_model(training_spectra,
query_spectra_for_training = unique_inchikey_query_spectra + single_spectra_query_spectra

# Create library files for training ms2query
library_creator_for_training = LibraryFilesCreator(library_spectra, output_directory=library_files_folder,
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
add_compound_classes=False)
library_creator_for_training.create_all_library_files()
library_creator_for_training = LibraryFilesCreator(
library_spectra,
sqlite_file_name=os.path.join(library_files_folder, "ms2query_library.sqlite"),
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
compound_classes=None)
library_creator_for_training.create_sqlite_file()

ms2library_for_training = MS2Library(sqlite_file_name=library_creator_for_training.sqlite_file_name,
s2v_model_file_name=s2v_model_file_name,
ms2ds_model_file_name=ms2ds_model_file_name,
pickled_s2v_embeddings_file_name=library_creator_for_training.s2v_embeddings_file_name,
pickled_ms2ds_embeddings_file_name=library_creator_for_training.ms2ds_embeddings_file_name,
ms2query_model_file_name=None)
# Create training data MS2Query model
collector = DataCollectorForTraining(ms2library_for_training)
Expand Down
Loading