From dd7444dae1d4bb28e4e338c121e7fcc9e37b4de9 Mon Sep 17 00:00:00 2001 From: zshang Date: Tue, 3 Mar 2026 20:04:46 +0800 Subject: [PATCH 1/6] Remove unused features --- .../feature_generator/spectral_similarity.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/optimhc/feature_generator/spectral_similarity.py b/optimhc/feature_generator/spectral_similarity.py index aa69fba..125e4d2 100644 --- a/optimhc/feature_generator/spectral_similarity.py +++ b/optimhc/feature_generator/spectral_similarity.py @@ -902,29 +902,6 @@ def _calculate_predicted_counts( return predicted_seen_nonzero, predicted_not_seen - def _calculate_bray_curtis_similarity( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate Bray-Curtis similarity between experimental and predicted vectors - - Parameters: - exp_vector (np.ndarray): Experimental intensity vector - pred_vector (np.ndarray): Predicted intensity vector - - Returns: - float: Bray-Curtis similarity (0-1, higher is better) - """ - # Bray-Curtis dissimilarity: sum(|exp - pred|) / sum(exp + pred) - numerator = np.sum(np.abs(exp_vector - pred_vector)) - denominator = np.sum(exp_vector + pred_vector) - - if denominator == 0: - return 0.0 - - # Convert dissimilarity to similarity: 1 - dissimilarity - return 1.0 - (numerator / denominator) - def _calculate_similarity_features( self, exp_vector: np.ndarray, pred_vector: np.ndarray ) -> Dict[str, float]: @@ -950,7 +927,6 @@ def _calculate_similarity_features( - unweighted_entropy_similarity: Spectral entropy similarity - predicted_seen_nonzero: Number of predicted peaks seen in experimental spectrum - predicted_not_seen: Number of predicted peaks not seen in experimental spectrum - - bray_curtis_similarity: Bray-Curtis similarity (0-1) """ spectral_angle_similarity = self._calculate_spectral_angle_similarity( exp_vector, pred_vector @@ -965,8 +941,6 @@ def _calculate_similarity_features( predicted_seen_nonzero, predicted_not_seen = self._calculate_predicted_counts( exp_vector, pred_vector ) - bray_curtis_similarity = self._calculate_bray_curtis_similarity(exp_vector, pred_vector) - return { "spectral_angle_similarity": spectral_angle_similarity, "cosine_similarity": cosine_similarity, @@ -976,7 +950,6 @@ def _calculate_similarity_features( "unweighted_entropy_similarity": unweighted_entropy_similarity, "predicted_seen_nonzero": predicted_seen_nonzero, "predicted_not_seen": predicted_not_seen, - "bray_curtis_similarity": bray_curtis_similarity, } def _generate_features(self) -> pd.DataFrame: From dbbaff227673a34d1bf867b3430126f71e0c58bc Mon Sep 17 00:00:00 2001 From: zshang Date: Wed, 4 Mar 2026 21:31:34 +0800 Subject: [PATCH 2/6] Refactor spectral similarity alignment, improve output structure, and add unit test --- .../feature_generator/spectral_similarity.py | 72 +++++------- tests/feature_generator/__init__.py | 0 .../test_spectral_similarity.py | 109 ++++++++++++++++++ 3 files changed, 140 insertions(+), 41 deletions(-) create mode 100644 tests/feature_generator/__init__.py create mode 100644 tests/feature_generator/test_spectral_similarity.py diff --git a/optimhc/feature_generator/spectral_similarity.py b/optimhc/feature_generator/spectral_similarity.py index 125e4d2..63d4337 100644 --- a/optimhc/feature_generator/spectral_similarity.py +++ b/optimhc/feature_generator/spectral_similarity.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd from koinapy import Koina -from scipy.spatial.distance import cosine from scipy.stats import pearsonr, spearmanr from optimhc import utils @@ -413,7 +412,7 @@ def _align_spectra_all_peaks( pred_intensity: List[float], pred_annotation: Optional[List[str]] = None, use_ppm: bool = True, - ) -> Tuple[np.ndarray, np.ndarray, List[Tuple], Dict]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: """ Align experimental and predicted spectra. @@ -426,10 +425,11 @@ def _align_spectra_all_peaks( use_ppm (bool): Whether to use ppm tolerance or Da tolerance Returns: - Tuple[np.ndarray, np.ndarray, List[Tuple], Dict]: + Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: - Aligned experimental intensity vector - Predicted intensity vector - - Matching index pairs + - Matching index pairs as int array of shape (N, 2), + where column 0 is pred_idx and column 1 is exp_idx (-1 = no match) - Additional info including original sorted arrays """ # Sort both experimental and predicted spectra by m/z @@ -443,27 +443,31 @@ def _align_spectra_all_peaks( pred_mz_sorted, pred_intensity_sorted, pred_annotation_sorted = ( self._sort_spectrum_by_mz(pred_mz, pred_intensity) ) - aligned_exp_intensity = np.zeros(len(pred_mz_sorted)) + + n_pred = len(pred_mz_sorted) + aligned_exp_intensity = np.zeros(n_pred, dtype=np.float64) aligned_pred_intensity = pred_intensity_sorted.copy() - matched_indices = [] + matched_indices = np.empty((n_pred, 2), dtype=np.int64) + matched_indices[:, 0] = np.arange(n_pred) + matched_indices[:, 1] = -1 start_pos = 0 + n_exp = len(exp_mz_sorted) - for i, pred_peak_mz in enumerate(pred_mz_sorted): + for i in range(n_pred): + pred_peak_mz = pred_mz_sorted[i] if use_ppm: fragment_min = pred_peak_mz * (1 - self.tolerance_ppm / 1e6) fragment_max = pred_peak_mz * (1 + self.tolerance_ppm / 1e6) else: - # If using Da tolerance, use a fixed window (typically ~0.05 Da) - tolerance = 0.05 # Default value - fragment_min = pred_peak_mz - tolerance - fragment_max = pred_peak_mz + tolerance + # TODO: Make this a parameter + fragment_min = pred_peak_mz - 0.02 + fragment_max = pred_peak_mz + 0.02 - matched_int = 0 - matched_exp_idx = None + matched_int = 0.0 past_start = 0 - while start_pos + past_start < len(exp_mz_sorted): + while start_pos + past_start < n_exp: exp_peak_mz = exp_mz_sorted[start_pos + past_start] if exp_peak_mz < fragment_min: start_pos += 1 @@ -471,18 +475,13 @@ def _align_spectra_all_peaks( exp_peak_int = exp_intensity_sorted[start_pos + past_start] if exp_peak_int > matched_int: matched_int = exp_peak_int - matched_exp_idx = start_pos + past_start + matched_indices[i, 1] = start_pos + past_start past_start += 1 else: break aligned_exp_intensity[i] = matched_int - # Record matching index pairs (pred_idx, exp_idx) - pred_idx = i - exp_idx = matched_exp_idx - matched_indices.append((pred_idx, exp_idx)) - additional_info = { "exp_mz_sorted": exp_mz_sorted, "exp_intensity_sorted": exp_intensity_sorted, @@ -502,37 +501,30 @@ def _get_top_peaks_vectors( self, aligned_exp_intensity: np.ndarray, aligned_pred_intensity: np.ndarray, - matched_indices: List[Tuple], + matched_indices: np.ndarray, top_n: int, - ) -> Tuple[np.ndarray, np.ndarray, List[Tuple]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Extract top N peaks based on predicted intensity for similarity calculation Parameters: aligned_exp_intensity (np.ndarray): Aligned experimental intensity vector aligned_pred_intensity (np.ndarray): Aligned predicted intensity vector - matched_indices (List[Tuple]): Matching index pairs (pred_idx, exp_idx) + matched_indices (np.ndarray): Matching index pairs, shape (N, 2) top_n (int): Number of top peaks to extract Returns: - Tuple[np.ndarray, np.ndarray, List[Tuple]]: + Tuple[np.ndarray, np.ndarray, np.ndarray]: - Top N experimental intensity vector - Top N predicted intensity vector - - Top N matching index pairs + - Top N matching index pairs, shape (top_n, 2) """ - # Get indices of top N peaks by predicted intensity num_peaks = min(top_n, len(aligned_pred_intensity)) top_pred_indices = np.argsort(-aligned_pred_intensity)[:num_peaks] - # Extract top N peaks - top_exp_intensity = np.zeros(num_peaks) - top_pred_intensity = np.zeros(num_peaks) - top_matched_indices = [] - - for i, orig_idx in enumerate(top_pred_indices): - top_exp_intensity[i] = aligned_exp_intensity[orig_idx] - top_pred_intensity[i] = aligned_pred_intensity[orig_idx] - top_matched_indices.append(matched_indices[orig_idx]) + top_exp_intensity = aligned_exp_intensity[top_pred_indices] + top_pred_intensity = aligned_pred_intensity[top_pred_indices] + top_matched_indices = matched_indices[top_pred_indices] return top_exp_intensity, top_pred_intensity, top_matched_indices @@ -543,7 +535,7 @@ def _align_spectra( pred_mz: List[float], pred_intensity: List[float], pred_annotation: Optional[List[str]] = None, - ) -> Tuple[np.ndarray, np.ndarray, List[Tuple]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Align experimental and predicted spectra. @@ -565,10 +557,10 @@ def _align_spectra( Returns ------- - tuple of (np.ndarray, np.ndarray, list of tuple) + tuple of (np.ndarray, np.ndarray, np.ndarray) - Aligned experimental intensity vector - Predicted intensity vector - - Matching index pairs (for top N peaks) + - Matching index pairs as int array of shape (top_n, 2) Notes ----- @@ -703,12 +695,10 @@ def _calculate_cosine_similarity( if np.sum(exp_vector) == 0 or np.sum(pred_vector) == 0: return 0.0 - # Normalize vectors using L2 normalization exp_norm = self._normalize_vector_l2(exp_vector) pred_norm = self._normalize_vector_l2(pred_vector) - # Use 1 - cosine distance = cosine similarity - return 1.0 - cosine(exp_norm, pred_norm) + return float(np.dot(exp_norm, pred_norm)) def _calculate_spearman_correlation( self, exp_vector: np.ndarray, pred_vector: np.ndarray diff --git a/tests/feature_generator/__init__.py b/tests/feature_generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/feature_generator/test_spectral_similarity.py b/tests/feature_generator/test_spectral_similarity.py new file mode 100644 index 0000000..60fe8b1 --- /dev/null +++ b/tests/feature_generator/test_spectral_similarity.py @@ -0,0 +1,109 @@ +"""Tests for spectral alignment: exp/pred spectra -> two aligned intensity vectors.""" + +import numpy as np + +from optimhc.feature_generator.spectral_similarity import SpectralSimilarityFeatureGenerator + + +def _make_generator(tolerance_ppm=20.0, top_n=36): + gen = object.__new__(SpectralSimilarityFeatureGenerator) + gen.tolerance_ppm = tolerance_ppm + gen.top_n = top_n + return gen + + +PRED_MZ = [147.11, 234.15, 347.23, 462.26, 577.29, 690.37, 803.45, 916.54] +PRED_INT = [0.05, 0.30, 0.80, 1.00, 0.60, 0.35, 0.15, 0.02] + +EXP_MZ = [ + 130.07, + 147.112, + 234.153, + 300.10, + 347.234, + 462.265, + 520.33, + 577.30, + 690.38, + 750.40, + 803.46, + 950.00, +] +EXP_INT = [ + 500.0, + 1200.0, + 8000.0, + 300.0, + 15000.0, + 22000.0, + 400.0, + 12000.0, + 7500.0, + 200.0, + 3000.0, + 100.0, +] + + +class TestAlignSpectraAllPeaks: + def test_realistic_alignment(self): + gen = _make_generator(tolerance_ppm=20.0) + aligned_exp, aligned_pred, matched, info = gen._align_spectra_all_peaks( + EXP_MZ, + EXP_INT, + PRED_MZ, + PRED_INT, + ) + assert len(aligned_exp) == len(PRED_MZ) + assert len(aligned_pred) == len(PRED_MZ) + assert matched.shape == (len(PRED_MZ), 2) + + # pred 147.11 should match exp 147.112 + assert aligned_exp[0] == 1200.0 + # pred 462.26 should match exp 462.265 + assert aligned_exp[3] == 22000.0 + # pred 916.54 has no nearby exp peak -> 0 + assert aligned_exp[7] == 0.0 + # unmatched index should be -1 + assert matched[7, 1] == -1 + + +class TestGetTopPeaksVectors: + def test_top_n_from_aligned(self): + gen = _make_generator(tolerance_ppm=20.0) + aligned_exp, aligned_pred, matched, _ = gen._align_spectra_all_peaks( + EXP_MZ, + EXP_INT, + PRED_MZ, + PRED_INT, + ) + top_exp, top_pred, top_matched = gen._get_top_peaks_vectors( + aligned_exp, + aligned_pred, + matched, + top_n=3, + ) + assert len(top_exp) == 3 + assert top_matched.shape == (3, 2) + # top 3 by pred intensity: 462.26 (1.0), 347.23 (0.8), 577.29 (0.6) + np.testing.assert_array_equal(sorted(top_pred), sorted([1.0, 0.8, 0.6])) + + +class TestAlignPipeline: + def test_e2e_align_and_top_peaks(self): + gen = _make_generator(tolerance_ppm=20.0, top_n=5) + aligned_exp, aligned_pred, matched, _ = gen._align_spectra_all_peaks( + EXP_MZ, + EXP_INT, + PRED_MZ, + PRED_INT, + ) + top_exp, top_pred, _ = gen._get_top_peaks_vectors( + aligned_exp, + aligned_pred, + matched, + top_n=5, + ) + features = gen._calculate_similarity_features(top_exp, top_pred) + assert features["spectral_angle_similarity"] > 0.5 + assert features["predicted_seen_nonzero"] >= 4 From 697373da53b9d6f2b07cd40926fa6a79880793e9 Mon Sep 17 00:00:00 2001 From: zshang Date: Wed, 4 Mar 2026 22:05:45 +0800 Subject: [PATCH 3/6] Add numba-optimized peak alignment function and integrate it into spectral similarity calculations --- optimhc/feature_generator/numba_utils.py | 54 +++++++++++++++++++ .../feature_generator/spectral_similarity.py | 49 ++++------------- 2 files changed, 65 insertions(+), 38 deletions(-) create mode 100644 optimhc/feature_generator/numba_utils.py diff --git a/optimhc/feature_generator/numba_utils.py b/optimhc/feature_generator/numba_utils.py new file mode 100644 index 0000000..8848133 --- /dev/null +++ b/optimhc/feature_generator/numba_utils.py @@ -0,0 +1,54 @@ +import numba as nb +import numpy as np + + +@nb.njit(cache=True) +def align_peaks( + exp_mz_sorted: np.ndarray, + exp_intensity_sorted: np.ndarray, + pred_mz_sorted: np.ndarray, + tolerance_ppm: float, +): + """ + Align sorted experimental peaks to sorted predicted peaks using ppm tolerance. + + For each predicted peak, find the experimental peak within the tolerance window + that has the highest intensity. + + Returns + ------- + aligned_exp_intensity : float64 array of length n_pred + matched_exp_indices : int64 array of length n_pred (-1 = no match) + """ + n_pred = len(pred_mz_sorted) + n_exp = len(exp_mz_sorted) + + aligned_exp_intensity = np.zeros(n_pred, dtype=np.float64) + matched_exp_indices = np.full(n_pred, -1, dtype=np.int64) + + start_pos = 0 + + for i in range(n_pred): + pred_peak_mz = pred_mz_sorted[i] + fragment_min = pred_peak_mz * (1.0 - tolerance_ppm / 1e6) + fragment_max = pred_peak_mz * (1.0 + tolerance_ppm / 1e6) + + matched_int = 0.0 + past_start = 0 + + while start_pos + past_start < n_exp: + exp_peak_mz = exp_mz_sorted[start_pos + past_start] + if exp_peak_mz < fragment_min: + start_pos += 1 + elif exp_peak_mz <= fragment_max: + exp_peak_int = exp_intensity_sorted[start_pos + past_start] + if exp_peak_int > matched_int: + matched_int = exp_peak_int + matched_exp_indices[i] = start_pos + past_start + past_start += 1 + else: + break + + aligned_exp_intensity[i] = matched_int + + return aligned_exp_intensity, matched_exp_indices diff --git a/optimhc/feature_generator/spectral_similarity.py b/optimhc/feature_generator/spectral_similarity.py index 63d4337..7a6ebf8 100644 --- a/optimhc/feature_generator/spectral_similarity.py +++ b/optimhc/feature_generator/spectral_similarity.py @@ -10,6 +10,7 @@ from optimhc import utils from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator +from optimhc.feature_generator.numba_utils import align_peaks from optimhc.parser import extract_mzml_data logger = logging.getLogger(__name__) @@ -411,10 +412,9 @@ def _align_spectra_all_peaks( pred_mz: List[float], pred_intensity: List[float], pred_annotation: Optional[List[str]] = None, - use_ppm: bool = True, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: """ - Align experimental and predicted spectra. + Align experimental and predicted spectra using ppm tolerance. Parameters: exp_mz (List[float]): Experimental m/z values @@ -422,7 +422,6 @@ def _align_spectra_all_peaks( pred_mz (List[float]): Predicted m/z values pred_intensity (List[float]): Predicted intensity values pred_annotation (Optional[List[str]]): Predicted fragment annotations - use_ppm (bool): Whether to use ppm tolerance or Da tolerance Returns: Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: @@ -445,42 +444,18 @@ def _align_spectra_all_peaks( ) n_pred = len(pred_mz_sorted) - aligned_exp_intensity = np.zeros(n_pred, dtype=np.float64) aligned_pred_intensity = pred_intensity_sorted.copy() - matched_indices = np.empty((n_pred, 2), dtype=np.int64) - matched_indices[:, 0] = np.arange(n_pred) - matched_indices[:, 1] = -1 - start_pos = 0 - n_exp = len(exp_mz_sorted) + aligned_exp_intensity, matched_exp_indices = align_peaks( + np.ascontiguousarray(exp_mz_sorted, dtype=np.float64), + np.ascontiguousarray(exp_intensity_sorted, dtype=np.float64), + np.ascontiguousarray(pred_mz_sorted, dtype=np.float64), + self.tolerance_ppm, + ) - for i in range(n_pred): - pred_peak_mz = pred_mz_sorted[i] - if use_ppm: - fragment_min = pred_peak_mz * (1 - self.tolerance_ppm / 1e6) - fragment_max = pred_peak_mz * (1 + self.tolerance_ppm / 1e6) - else: - # TODO: Make this a parameter - fragment_min = pred_peak_mz - 0.02 - fragment_max = pred_peak_mz + 0.02 - - matched_int = 0.0 - past_start = 0 - - while start_pos + past_start < n_exp: - exp_peak_mz = exp_mz_sorted[start_pos + past_start] - if exp_peak_mz < fragment_min: - start_pos += 1 - elif exp_peak_mz <= fragment_max: - exp_peak_int = exp_intensity_sorted[start_pos + past_start] - if exp_peak_int > matched_int: - matched_int = exp_peak_int - matched_indices[i, 1] = start_pos + past_start - past_start += 1 - else: - break - - aligned_exp_intensity[i] = matched_int + matched_indices = np.empty((n_pred, 2), dtype=np.int64) + matched_indices[:, 0] = np.arange(n_pred) + matched_indices[:, 1] = matched_exp_indices additional_info = { "exp_mz_sorted": exp_mz_sorted, @@ -576,7 +551,6 @@ def _align_spectra( pred_mz, pred_intensity, pred_annotation, - use_ppm=True, ) ) @@ -997,7 +971,6 @@ def _generate_features(self) -> pd.DataFrame: pred_mz, pred_intensity, pred_annotation, - use_ppm=True, ) # Extract top N peaks for similarity calculations From 9ad97d0012820737d051e8e6d8a1b826724e6eec Mon Sep 17 00:00:00 2001 From: zshang Date: Wed, 4 Mar 2026 22:08:44 +0800 Subject: [PATCH 4/6] Update dependencies in pyproject.toml and uv.lock to include numba --- pyproject.toml | 5 +++-- uv.lock | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d33d077..2faa19b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "lxml==5.3.0", "matplotlib==3.9.2", "mhctools @ git+https://github.com/openvax/mhctools.git@868ed09b4dfcab18aed563727d65bca3408476ea", - "setuptools<82,>=61", # mhctools uses pkg_resources at runtime + "setuptools<82,>=61", # mhctools uses pkg_resources at runtime "mhcflurry==2.1.4", "mokapot==0.10.0", "networkx==3.2.1", @@ -30,7 +30,8 @@ dependencies = [ "scipy==1.13.1", "seaborn==0.13.2", "tqdm==4.67.0", - "xgboost==1.7.6" + "xgboost==1.7.6", + "numba>=0.64.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 812578c..599fb94 100644 --- a/uv.lock +++ b/uv.lock @@ -1352,6 +1352,7 @@ dependencies = [ { name = "mhctools" }, { name = "mokapot" }, { name = "networkx" }, + { name = "numba" }, { name = "numpy" }, { name = "pandas" }, { name = "pyteomics" }, @@ -1388,6 +1389,7 @@ requires-dist = [ { name = "mhctools", git = "https://github.com/openvax/mhctools.git?rev=868ed09b4dfcab18aed563727d65bca3408476ea" }, { name = "mokapot", specifier = "==0.10.0" }, { name = "networkx", specifier = "==3.2.1" }, + { name = "numba", specifier = ">=0.64.0" }, { name = "numpy", specifier = "==1.26.4" }, { name = "pandas", specifier = "==2.2.3" }, { name = "plotly", marker = "extra == 'gui'", specifier = ">=5.13.0" }, From 0b0f21229298f68a8e2fb683cd9c69b3b05783bd Mon Sep 17 00:00:00 2001 From: zshang Date: Wed, 4 Mar 2026 22:09:12 +0800 Subject: [PATCH 5/6] Fix regex pattern usage --- optimhc/core/feature_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimhc/core/feature_generation.py b/optimhc/core/feature_generation.py index 393763f..d83fff6 100644 --- a/optimhc/core/feature_generation.py +++ b/optimhc/core/feature_generation.py @@ -280,11 +280,11 @@ def generate_features(psms, config): else: logger.info("MS data file information is not provided.") logger.info( - "Trying to use the default pattern: (.+?)\.\d+\.\d+\.\d+ to extract mzML file names from spectrum IDs." + r"Trying to use the default pattern: (.+?)\.\d+\.\d+\.\d+ to extract mzML file names from spectrum IDs." ) for spectrum_id in spectrum_ids: mz_file_names.append( - re.match("(.+?)\.\d+\.\d+\.\d+", spectrum_id).group(1) + re.match(r"(.+?)\.\d+\.\d+\.\d+", spectrum_id).group(1) ) mz_file_paths = [ From 828fdd7a20acdeff2b9e45a89f3fc832d0242d9f Mon Sep 17 00:00:00 2001 From: zshang Date: Wed, 4 Mar 2026 23:28:58 +0800 Subject: [PATCH 6/6] Add compute_similarity_features function for optimized similarity calculations, and remove deprecated methods --- optimhc/feature_generator/numba_utils.py | 157 ++++++ .../feature_generator/spectral_similarity.py | 513 ++++-------------- 2 files changed, 265 insertions(+), 405 deletions(-) diff --git a/optimhc/feature_generator/numba_utils.py b/optimhc/feature_generator/numba_utils.py index 8848133..d9333f2 100644 --- a/optimhc/feature_generator/numba_utils.py +++ b/optimhc/feature_generator/numba_utils.py @@ -52,3 +52,160 @@ def align_peaks( aligned_exp_intensity[i] = matched_int return aligned_exp_intensity, matched_exp_indices + + +@nb.njit(cache=True) +def _rankdata_average(arr): + n = len(arr) + sorter = np.argsort(arr) + ranks = np.empty(n, dtype=np.float64) + sorted_arr = arr[sorter] + i = 0 + while i < n: + j = i + 1 + while j < n and sorted_arr[j] == sorted_arr[i]: + j += 1 + avg = 0.5 * (i + 1 + j) + for k in range(i, j): + ranks[sorter[k]] = avg + i = j + return ranks + + +@nb.njit(cache=True) +def compute_similarity_features(exp_vector, pred_vector): + """ + Compute all similarity metrics between two aligned intensity vectors. + + Returns + ------- + tuple of 8 float64 values: + (spectral_angle_similarity, cosine_similarity, + pearson_correlation, spearman_correlation, + mean_squared_error, unweighted_entropy_similarity, + predicted_seen_nonzero, predicted_not_seen) + """ + n = len(exp_vector) + + exp_sq = 0.0 + pred_sq = 0.0 + dot = 0.0 + exp_sum = 0.0 + pred_sum = 0.0 + for i in range(n): + e = exp_vector[i] + p = pred_vector[i] + exp_sq += e * e + pred_sq += p * p + dot += e * p + exp_sum += e + pred_sum += p + + exp_l2 = np.sqrt(exp_sq) + pred_l2 = np.sqrt(pred_sq) + + # spectral angle similarity + if exp_l2 > 0.0 and pred_l2 > 0.0: + cos_val = dot / (exp_l2 * pred_l2) + cos_val = max(-1.0, min(1.0, cos_val)) + spectral_angle = 1.0 - 2.0 * np.arccos(cos_val) / np.pi + else: + spectral_angle = 0.0 + + # cosine similarity + if exp_sum == 0.0 or pred_sum == 0.0: + cosine_sim = 0.0 + elif exp_l2 > 0.0 and pred_l2 > 0.0: + cosine_sim = dot / (exp_l2 * pred_l2) + else: + cosine_sim = 0.0 + + # MSE on L2-normalized vectors + mse = 0.0 + for i in range(n): + e = exp_vector[i] / exp_l2 if exp_l2 > 0.0 else 0.0 + p = pred_vector[i] / pred_l2 if pred_l2 > 0.0 else 0.0 + diff = e - p + mse += diff * diff + mse /= n if n > 0 else 1.0 + + # pearson correlation + exp_mean = exp_sum / n if n > 0 else 0.0 + pred_mean = pred_sum / n if n > 0 else 0.0 + num_p = 0.0 + exp_var = 0.0 + pred_var = 0.0 + for i in range(n): + de = exp_vector[i] - exp_mean + dp = pred_vector[i] - pred_mean + num_p += de * dp + exp_var += de * de + pred_var += dp * dp + if exp_var > 0.0 and pred_var > 0.0: + pearson = num_p / np.sqrt(exp_var * pred_var) + else: + pearson = 0.0 + + # spearman correlation + if exp_var > 0.0 and pred_var > 0.0: + exp_ranks = _rankdata_average(exp_vector) + pred_ranks = _rankdata_average(pred_vector) + rmean_e = 0.0 + rmean_p = 0.0 + for i in range(n): + rmean_e += exp_ranks[i] + rmean_p += pred_ranks[i] + rmean_e /= n + rmean_p /= n + snum = 0.0 + svar_e = 0.0 + svar_p = 0.0 + for i in range(n): + de = exp_ranks[i] - rmean_e + dp = pred_ranks[i] - rmean_p + snum += de * dp + svar_e += de * de + svar_p += dp * dp + if svar_e > 0.0 and svar_p > 0.0: + spearman = snum / np.sqrt(svar_e * svar_p) + else: + spearman = 0.0 + else: + spearman = 0.0 + + # unweighted entropy similarity + s_exp = 0.0 + s_pred = 0.0 + s_mixed = 0.0 + for i in range(n): + ep = exp_vector[i] / exp_sum if exp_sum > 0.0 else 0.0 + pp = pred_vector[i] / pred_sum if pred_sum > 0.0 else 0.0 + if ep > 0.0: + s_exp -= ep * np.log(ep) + if pp > 0.0: + s_pred -= pp * np.log(pp) + mp = 0.5 * (ep + pp) + if mp > 0.0: + s_mixed -= mp * np.log(mp) + entropy_sim = 1.0 - (2.0 * s_mixed - s_exp - s_pred) / np.log(4.0) + + # predicted seen/not seen + seen = 0.0 + not_seen = 0.0 + for i in range(n): + if pred_vector[i] > 0.0: + if exp_vector[i] > 0.0: + seen += 1.0 + else: + not_seen += 1.0 + + return ( + spectral_angle, + cosine_sim, + pearson, + spearman, + mse, + entropy_sim, + seen, + not_seen, + ) diff --git a/optimhc/feature_generator/spectral_similarity.py b/optimhc/feature_generator/spectral_similarity.py index 7a6ebf8..698452f 100644 --- a/optimhc/feature_generator/spectral_similarity.py +++ b/optimhc/feature_generator/spectral_similarity.py @@ -6,11 +6,10 @@ import numpy as np import pandas as pd from koinapy import Koina -from scipy.stats import pearsonr, spearmanr from optimhc import utils from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator -from optimhc.feature_generator.numba_utils import align_peaks +from optimhc.feature_generator.numba_utils import align_peaks, compute_similarity_features from optimhc.parser import extract_mzml_data logger = logging.getLogger(__name__) @@ -561,359 +560,28 @@ def _align_spectra( return top_exp_intensity, top_pred_intensity, top_matched_indices - def _normalize_vector_l2(self, vector: np.ndarray) -> np.ndarray: - """ - Normalize a vector using L2 normalization (unit vector). - - Parameters - ---------- - vector : np.ndarray - Input vector to normalize. - - Returns - ------- - np.ndarray - L2-normalized vector. - - Notes - ----- - If the input vector has zero norm, the original vector is returned unchanged. - """ - norm = np.linalg.norm(vector) # Calculate the L2 norm (Euclidean norm) - if norm == 0: - return vector - return vector / norm - - def _normalize_vector_sum(self, vector: np.ndarray) -> np.ndarray: - """ - Perform sum normalization (probability normalization) on a vector. - - Parameters - ---------- - vector : np.ndarray - Input vector. - - Returns - ------- - np.ndarray - Sum normalized vector (sum to 1). - - Notes - ----- - If the input vector has zero sum, the original vector is returned unchanged. - """ - total = np.sum(vector) - if total > 0: - return vector / total - return vector - - def _calculate_spectral_angle_similarity( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate the spectral angle between experimental and predicted vectors. - - Normalize the angle to [0, 1] where 1 is the best similarity. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Spectral angle similarity (0-1, higher is better). - - Notes - ----- - The spectral angle is calculated as: SA = 1 - (2 * angle / π), - where angle is the angle between the normalized vectors in radians. - """ - exp_norm = self._normalize_vector_l2(exp_vector) - pred_norm = self._normalize_vector_l2(pred_vector) - dot_product = np.sum(exp_norm * pred_norm) - - # Clamp dot product to [-1, 1] to avoid numerical issues - dot_product = np.clip(dot_product, -1.0, 1.0) - angle = np.arccos(dot_product) # Angle in radians - - # Return similarity score: SA = 1 - (2 * angle / π) - return 1 - (2 * angle / np.pi) - - def _calculate_cosine_similarity( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate the cosine similarity between experimental and predicted vectors. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Cosine similarity (0-1, higher is better). - - Notes - ----- - The cosine similarity is calculated as 1 - cosine_distance. - If either vector has zero sum, returns 0.0. - """ - if np.sum(exp_vector) == 0 or np.sum(pred_vector) == 0: - return 0.0 - - exp_norm = self._normalize_vector_l2(exp_vector) - pred_norm = self._normalize_vector_l2(pred_vector) - - return float(np.dot(exp_norm, pred_norm)) - - def _calculate_spearman_correlation( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate Spearman correlation coefficient between experimental and predicted vectors. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Spearman correlation coefficient (-1 to 1, higher is better). - - Notes - ----- - If either vector has no variation (std = 0), returns 0.0. - """ - # Handle vectors with no variation - if np.std(exp_vector) == 0 or np.std(pred_vector) == 0: - return 0.0 - - try: - r, _ = spearmanr(exp_vector, pred_vector) - return r - except Exception: - return - - def _calculate_pearson_correlation( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate Pearson correlation coefficient between experimental and predicted vectors. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Pearson correlation coefficient (-1 to 1, higher is better). - - Notes - ----- - If either vector has no variation (std = 0), returns 0.0. - """ - # Handle vectors with no variation - if np.std(exp_vector) == 0 or np.std(pred_vector) == 0: - return 0.0 - - try: - r, _ = pearsonr(exp_vector, pred_vector) - return r - except Exception: - return 0.0 - - def _calculate_mean_squared_error( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate mean squared error between experimental and predicted vectors. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Mean squared error (lower is better). - - Notes - ----- - The vectors are normalized using L2 normalization before calculating - the mean squared error for fair comparison. - """ - # Normalize vectors for fair comparison using L2 normalization - exp_norm = self._normalize_vector_l2(exp_vector) - pred_norm = self._normalize_vector_l2(pred_vector) - - return np.mean((exp_norm - pred_norm) ** 2) - - def _calculate_entropy(self, vector: np.ndarray) -> float: - """ - Calculate Shannon entropy of a vector that has already been sum-1 normalized. - - Parameters - ---------- - vector : np.ndarray - Input vector, which has already been sum-1 normalized. - - Returns - ------- - float - Shannon entropy. - - Notes - ----- - Only non-zero probabilities are considered for entropy calculation. - If all probabilities are zero, returns 0.0. - """ - # Only consider non-zero probabilities for entropy calculation - mask = vector > 0 - if not np.any(mask): - return 0.0 - - prob_vector = vector[mask] - return -np.sum(prob_vector * np.log(prob_vector)) - - def _calculate_unweighted_entropy_similarity( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> float: - """ - Calculate unweighted spectral entropy between experimental and predicted vectors. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - float - Spectral entropy similarity. - - Notes - ----- - Based on the method described in https://www.nature.com/articles/s41592-021-01331-z. - The spectral entropy is calculated using the formula: - 1 - (2*S_PM - S_P - S_M)/ln(4), where S_PM is the entropy of the mixed - distribution, and S_P and S_M are the entropies of the individual distributions. - """ - # Sum-to-1 normalization - sum_normalized_exp_vector = self._normalize_vector_sum(exp_vector) - sum_normalized_pred_vector = self._normalize_vector_sum(pred_vector) - s_exp = self._calculate_entropy(sum_normalized_exp_vector) - s_pred = self._calculate_entropy(sum_normalized_pred_vector) - s_mixed = self._calculate_entropy( - 0.5 * (sum_normalized_exp_vector + sum_normalized_pred_vector) - ) - - # Calculate spectral entropy using formula: 1 - (2*S_PM - S_P - S_M)/ln(4) - unweighted_entropy_similarity = 1.0 - (2 * s_mixed - s_exp - s_pred) / np.log(4) - - return unweighted_entropy_similarity - - # TODO: Assign weight to each peak based on the entropy of a spectrum. - # The original paper uses 3 as a entropy cutoff to assign more weight to the low intensity peaks. - # But the cutoff value is determined by a small-molecular dataset rather than a proteomics dataset. - # Thus, we need to design our own heuristic algorithm to calculate a similar feature for proteomics dataset. - # We can refer to the practice of MSBooster. - # url: https://github.com/Nesvilab/MSBooster/blame/master/src/main/java/features/spectra/SpectrumComparison.java#L803 - - # def _calculate_entropy_similarity(self, exp_vector: np.ndarray, pred_vector: np.ndarray) -> float: - - def _calculate_predicted_counts( - self, exp_vector: np.ndarray, pred_vector: np.ndarray - ) -> Tuple[int, int]: - """ - Calculate counts of predicted peaks seen/not seen in experimental spectrum. - - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - tuple of (int, int) - - predicted_seen_nonzero: Number of predicted peaks that are also present in experimental spectrum - - predicted_not_seen: Number of predicted peaks that are not present in experimental spectrum - """ - predicted_seen_nonzero = np.sum((pred_vector > 0) & (exp_vector > 0)) - predicted_not_seen = np.sum((pred_vector > 0) & (exp_vector == 0)) - - return predicted_seen_nonzero, predicted_not_seen - def _calculate_similarity_features( self, exp_vector: np.ndarray, pred_vector: np.ndarray ) -> Dict[str, float]: """ Calculate all similarity features between experimental and predicted vectors. - Parameters - ---------- - exp_vector : np.ndarray - Experimental intensity vector. - pred_vector : np.ndarray - Predicted intensity vector. - - Returns - ------- - dict of str to float - Dictionary of similarity features, including: - - spectral_angle_similarity: Spectral angle similarity (0-1) - - cosine_similarity: Cosine similarity (0-1) - - pearson_correlation: Pearson correlation (-1 to 1) - - spearman_correlation: Spearman correlation (-1 to 1) - - mean_squared_error: Mean squared error - - unweighted_entropy_similarity: Spectral entropy similarity - - predicted_seen_nonzero: Number of predicted peaks seen in experimental spectrum - - predicted_not_seen: Number of predicted peaks not seen in experimental spectrum + Uses a single numba-compiled function that computes all metrics in one + pass, reusing shared intermediate values (L2 norms, sums, ranks). """ - spectral_angle_similarity = self._calculate_spectral_angle_similarity( - exp_vector, pred_vector - ) - cosine_similarity = self._calculate_cosine_similarity(exp_vector, pred_vector) - pearson_correlation = self._calculate_pearson_correlation(exp_vector, pred_vector) - spearman_correlation = self._calculate_spearman_correlation(exp_vector, pred_vector) - mean_squared_error = self._calculate_mean_squared_error(exp_vector, pred_vector) - unweighted_entropy_similarity = self._calculate_unweighted_entropy_similarity( - exp_vector, pred_vector - ) - predicted_seen_nonzero, predicted_not_seen = self._calculate_predicted_counts( - exp_vector, pred_vector + (sa, cos, pear, spear, mse, ent, seen, not_seen) = compute_similarity_features( + np.ascontiguousarray(exp_vector, dtype=np.float64), + np.ascontiguousarray(pred_vector, dtype=np.float64), ) return { - "spectral_angle_similarity": spectral_angle_similarity, - "cosine_similarity": cosine_similarity, - "pearson_correlation": pearson_correlation, - "spearman_correlation": spearman_correlation, - "mean_squared_error": mean_squared_error, - "unweighted_entropy_similarity": unweighted_entropy_similarity, - "predicted_seen_nonzero": predicted_seen_nonzero, - "predicted_not_seen": predicted_not_seen, + "spectral_angle_similarity": sa, + "cosine_similarity": cos, + "pearson_correlation": pear, + "spearman_correlation": spear, + "mean_squared_error": mse, + "unweighted_entropy_similarity": ent, + "predicted_seen_nonzero": int(seen), + "predicted_not_seen": int(not_seen), } def _generate_features(self) -> pd.DataFrame: @@ -949,69 +617,104 @@ def _generate_features(self) -> pd.DataFrame: logger.warning("Some PSMs were not found in experimental spectral data") psm_df = pd.merge(psm_df, pred_spectra_df, on=["processed_peptide", "charge"], how="inner") - results = [] - - logger.info("Matching experimental and predicted spectra... This may take a while.") - for _, row in psm_df.iterrows(): - exp_mz = row["mz"] - exp_intensity = row["intensity"] - pred_mz = row["pred_mz"] - pred_intensity = row["pred_intensity"] - pred_annotation = row["annotation"] if "annotation" in row else None - - # Align all peaks - ( - all_exp_intensity, - all_pred_intensity, - all_matched_indices, - additional_info, - ) = self._align_spectra_all_peaks( - exp_mz, - exp_intensity, - pred_mz, - pred_intensity, - pred_annotation, - ) - # Extract top N peaks for similarity calculations - top_exp_intensity, top_pred_intensity, top_matched_indices = ( - self._get_top_peaks_vectors( - all_exp_intensity, - all_pred_intensity, - all_matched_indices, - self.top_n, - ) + n_rows = len(psm_df) + logger.info(f"Matching experimental and predicted spectra for {n_rows} PSMs...") + + exp_mz_col = psm_df["mz"].values + exp_int_col = psm_df["intensity"].values + pred_mz_col = psm_df["pred_mz"].values + pred_int_col = psm_df["pred_intensity"].values + has_annotation = "annotation" in psm_df.columns + pred_ann_col = psm_df["annotation"].values if has_annotation else None + + sa_arr = np.empty(n_rows, dtype=np.float64) + cos_arr = np.empty(n_rows, dtype=np.float64) + pear_arr = np.empty(n_rows, dtype=np.float64) + spear_arr = np.empty(n_rows, dtype=np.float64) + mse_arr = np.empty(n_rows, dtype=np.float64) + ent_arr = np.empty(n_rows, dtype=np.float64) + seen_arr = np.empty(n_rows, dtype=np.int64) + not_seen_arr = np.empty(n_rows, dtype=np.int64) + + exp_vectors = [None] * n_rows + pred_vectors = [None] * n_rows + matched_indices_list = [None] * n_rows + exp_top_vectors = [None] * n_rows + pred_top_vectors = [None] * n_rows + top_matched_list = [None] * n_rows + exp_mz_sorted_list = [None] * n_rows + exp_int_sorted_list = [None] * n_rows + pred_mz_sorted_list = [None] * n_rows + pred_int_sorted_list = [None] * n_rows + pred_ann_sorted_list = [None] * n_rows if has_annotation else None + + for i in range(n_rows): + pred_ann = pred_ann_col[i] if pred_ann_col is not None else None + + all_exp, all_pred, all_matched, info = self._align_spectra_all_peaks( + exp_mz_col[i], + exp_int_col[i], + pred_mz_col[i], + pred_int_col[i], + pred_ann, ) - similarity_features = self._calculate_similarity_features( - top_exp_intensity, top_pred_intensity + top_exp, top_pred, top_matched = self._get_top_peaks_vectors( + all_exp, + all_pred, + all_matched, + self.top_n, ) - result = { - "exp_vector": all_exp_intensity.tolist(), - "pred_vector": all_pred_intensity.tolist(), - "matched_indices": all_matched_indices, - "exp_top_vector": top_exp_intensity.tolist(), - "pred_top_vector": top_pred_intensity.tolist(), - "top_matched_indices": top_matched_indices, - **similarity_features, - "exp_mz_sorted": additional_info["exp_mz_sorted"].tolist(), - "exp_intensity_sorted": additional_info["exp_intensity_sorted"].tolist(), - "pred_mz_sorted": additional_info["pred_mz_sorted"].tolist(), - "pred_intensity_sorted": additional_info["pred_intensity_sorted"].tolist(), - } - - # Add annotations if available - if additional_info["pred_annotation_sorted"] is not None: - result["pred_annotation_sorted"] = additional_info[ - "pred_annotation_sorted" - ].tolist() - - results.append(result) - results_df = pd.DataFrame(results) - psm_df = pd.concat( - [psm_df.reset_index(drop=True), results_df.reset_index(drop=True)], axis=1 - ) + (sa, cos, pear, spear, mse, ent, seen, not_seen) = compute_similarity_features( + np.ascontiguousarray(top_exp, dtype=np.float64), + np.ascontiguousarray(top_pred, dtype=np.float64), + ) + sa_arr[i] = sa + cos_arr[i] = cos + pear_arr[i] = pear + spear_arr[i] = spear + mse_arr[i] = mse + ent_arr[i] = ent + seen_arr[i] = int(seen) + not_seen_arr[i] = int(not_seen) + + exp_vectors[i] = all_exp + pred_vectors[i] = all_pred + matched_indices_list[i] = all_matched + exp_top_vectors[i] = top_exp + pred_top_vectors[i] = top_pred + top_matched_list[i] = top_matched + exp_mz_sorted_list[i] = info["exp_mz_sorted"] + exp_int_sorted_list[i] = info["exp_intensity_sorted"] + pred_mz_sorted_list[i] = info["pred_mz_sorted"] + pred_int_sorted_list[i] = info["pred_intensity_sorted"] + if pred_ann_sorted_list is not None: + pred_ann_sorted_list[i] = info["pred_annotation_sorted"] + + psm_df = psm_df.reset_index(drop=True) + psm_df["spectral_angle_similarity"] = sa_arr + psm_df["cosine_similarity"] = cos_arr + psm_df["pearson_correlation"] = pear_arr + psm_df["spearman_correlation"] = spear_arr + psm_df["mean_squared_error"] = mse_arr + psm_df["unweighted_entropy_similarity"] = ent_arr + psm_df["predicted_seen_nonzero"] = seen_arr + psm_df["predicted_not_seen"] = not_seen_arr + + psm_df["exp_vector"] = exp_vectors + psm_df["pred_vector"] = pred_vectors + psm_df["matched_indices"] = matched_indices_list + psm_df["exp_top_vector"] = exp_top_vectors + psm_df["pred_top_vector"] = pred_top_vectors + psm_df["top_matched_indices"] = top_matched_list + psm_df["exp_mz_sorted"] = exp_mz_sorted_list + psm_df["exp_intensity_sorted"] = exp_int_sorted_list + psm_df["pred_mz_sorted"] = pred_mz_sorted_list + psm_df["pred_intensity_sorted"] = pred_int_sorted_list + if pred_ann_sorted_list is not None: + psm_df["pred_annotation_sorted"] = pred_ann_sorted_list result_columns = [ "mz_file_path", @@ -1029,7 +732,7 @@ def _generate_features(self) -> pd.DataFrame: "exp_intensity_sorted", "pred_mz_sorted", "pred_intensity_sorted", - ("pred_annotation_sorted" if "pred_annotation_sorted" in psm_df.columns else None), + ("pred_annotation_sorted" if pred_ann_sorted_list is not None else None), "exp_vector", "pred_vector", "matched_indices",