From 0ec6642bfe4f273dc78874de048d2151e5b22f9e Mon Sep 17 00:00:00 2001 From: zshang Date: Fri, 6 Mar 2026 14:59:17 +0800 Subject: [PATCH 1/3] Fixes bug in PR #123, and update docstrings --- optimhc/rescore/mokapot.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/optimhc/rescore/mokapot.py b/optimhc/rescore/mokapot.py index 53cfc06..2909f97 100644 --- a/optimhc/rescore/mokapot.py +++ b/optimhc/rescore/mokapot.py @@ -30,11 +30,13 @@ def rescore( psms : PsmContainer A PsmContainer object containing PSM data. model : object, optional - A trained model for rescoring PSMs. + An untrained mokapot-compatible model (e.g. PercolatorModel, XGBoostPercolatorModel). + mokapot.brew trains it internally across cross-validation folds. If None, mokapot + uses its default PercolatorModel. rescoring_features : List[str], optional A list of feature names to use for rescoring. test_fdr : float, optional - The FDR threshold for testing the model. Default is 0.01. + The FDR threshold used to evaluate and report results after training. Default is 0.01. **kwargs : dict Additional keyword arguments for mokapot.brew. @@ -47,19 +49,18 @@ def rescore( (i.e. PSMs, peptides) when assessed using the learned score. If a list, they will be in the same order as provided in the psms parameter. - list of Model objects: - The learned Model objects, one for each fold. + The trained Model objects, one for each cross-validation fold. Notes ----- This function: - 1. Converts the PsmContainer to a mokapot dataset - 2. Runs mokapot.brew with the specified parameters - 3. Returns the results and models + 1. Converts the PsmContainer to a mokapot LinearPsmDataset + 2. Passes the dataset and untrained model to mokapot.brew, which trains across folds + 3. Returns the per-fold confidence results and trained models """ psms = convert_to_mokapot_dataset(psms, rescoring_features=rescoring_features) logger.info("Rescoring PSMs with mokapot.") - model_arg = [model] if model is not None else None - results, models = mokapot.brew(psms, model=model_arg, test_fdr=test_fdr, **kwargs) + results, models = mokapot.brew(psms, model=model, test_fdr=test_fdr, **kwargs) return results, models From 50a8aca2d5ff8015cbbda23a64c43454337a4031 Mon Sep 17 00:00:00 2001 From: zshang Date: Sat, 7 Mar 2026 15:57:02 +0800 Subject: [PATCH 2/3] fix minor bugs --- optimhc/cli.py | 7 +------ optimhc/core/config.py | 5 +++-- optimhc/core/pipeline.py | 12 ++++-------- optimhc/feature_generator/PWM.py | 4 ++-- optimhc/feature_generator/mhcflurry.py | 10 +--------- optimhc/utils.py | 5 +---- optimhc/visualization/plot_features.py | 2 +- optimhc/visualization/save_or_show_plot.py | 1 + 8 files changed, 14 insertions(+), 32 deletions(-) diff --git a/optimhc/cli.py b/optimhc/cli.py index f39222f..2f7c589 100644 --- a/optimhc/cli.py +++ b/optimhc/cli.py @@ -26,11 +26,6 @@ def cli(): pass -def parse_cli_config(**kwargs): - # Remove None values and build a config dict - return {k: v for k, v in kwargs.items() if v is not None and v != ()} - - @cli.command() @click.option( "--config", @@ -129,7 +124,7 @@ def pipeline( if visualization is not None: pipeline_config["visualization"] = visualization if numprocesses: - pipeline_config["numProcess"] = numprocesses + pipeline_config["numProcesses"] = numprocesses if allele: pipeline_config["allele"] = list(allele) if loglevel: diff --git a/optimhc/core/config.py b/optimhc/core/config.py index e3dfa4b..a88fabf 100644 --- a/optimhc/core/config.py +++ b/optimhc/core/config.py @@ -16,7 +16,7 @@ "saveModels": True, "toFlashLFQ": True, "allele": [], - "numProcess": 4, + "numProcesses": 4, "removePreNxtAA": False, "showProgress": True, "logLevel": "INFO", @@ -213,7 +213,8 @@ def validate(self): input_files = self._config["inputFile"] if not isinstance(input_files, (list, tuple)): logger.debug(f"inputFile is not a list or tuple: {input_files}. Converting to list.") - self._config["inputFile"] = list(input_files) + self._config["inputFile"] = [input_files] + input_files = self._config["inputFile"] if not input_files: logger.error("inputFile list cannot be empty") raise ValueError("inputFile list cannot be empty") diff --git a/optimhc/core/pipeline.py b/optimhc/core/pipeline.py index 6574ec4..32b5653 100644 --- a/optimhc/core/pipeline.py +++ b/optimhc/core/pipeline.py @@ -281,6 +281,8 @@ def _run_single_experiment(self, psms, exp_config, exp_name, exp_dir): bool True if experiment succeeded, False otherwise. """ + results = None + models = None try: os.makedirs(exp_dir, exist_ok=True) @@ -333,12 +335,8 @@ def _run_single_experiment(self, psms, exp_config, exp_name, exp_dir): return False finally: - # Explicit resource release to free up memory after each experiment - try: - del results - del models - except Exception: - pass + del results + del models gc.collect() def run(self): @@ -382,8 +380,6 @@ def run_experiments(self): psms = self.read_input() psms = self._generate_features(psms) - - # Save the generated pin file for reference pin_path = os.path.join(self.output_dir, f"optimhc.{self.experiment}.pin") psms.write_pin(pin_path) fig_summary_dir = os.path.join(self.output_dir, "figures") diff --git a/optimhc/feature_generator/PWM.py b/optimhc/feature_generator/PWM.py index bf6793c..84b541d 100644 --- a/optimhc/feature_generator/PWM.py +++ b/optimhc/feature_generator/PWM.py @@ -284,7 +284,7 @@ def _default_allele_pwm_files(self) -> Dict[str, Dict[int, str]]: logger.debug(f"Default PWM file paths set for alleles: {self.alleles}") return pwm_files - def _most_conserved_postions(self, pwm: pd.DataFrame, n: int = 2) -> List[int]: + def _most_conserved_positions(self, pwm: pd.DataFrame, n: int = 2) -> List[int]: """ Find the n most conserved positions in the PWM. @@ -635,7 +635,7 @@ def generate_features(self) -> pd.DataFrame: min_mer = min(self.pwms[allele].keys()) max_mer = max(self.pwms[allele].keys()) for mer_len in range(min_mer, max_mer + 1): - anchor_dict[mer_len] = self._most_conserved_postions( + anchor_dict[mer_len] = self._most_conserved_positions( self.pwms[allele][mer_len], self.anchors ) logger.info(f"Most conserved positions for allele {allele}: {anchor_dict}") diff --git a/optimhc/feature_generator/mhcflurry.py b/optimhc/feature_generator/mhcflurry.py index d5fbe07..1cc3b30 100644 --- a/optimhc/feature_generator/mhcflurry.py +++ b/optimhc/feature_generator/mhcflurry.py @@ -297,15 +297,7 @@ def generate_features(self) -> pd.DataFrame: ] if features_df.isna().sum().sum() > 0: logger.warning("NaN values found in the generated features.") - return features_df[ - [ - "Peptide", - "mhcflurry_affinity", - "mhcflurry_processing_score", - "mhcflurry_presentation_score", - "mhcflurry_presentation_percentile", - ] - ] + return features_df def get_best_allele(self) -> pd.DataFrame: """ diff --git a/optimhc/utils.py b/optimhc/utils.py index 5ee1be1..3a4e51f 100644 --- a/optimhc/utils.py +++ b/optimhc/utils.py @@ -1,5 +1,6 @@ # utils.py +import re from logging import getLogger from pathlib import Path from typing import List @@ -80,8 +81,6 @@ def strip_flanking_and_charge(peptide: str) -> str: This function removes any amino acids before the first '.' and after the last '.' in the peptide sequence. """ - import re - peptide = re.sub(r"^[^.]*\.|\.[^.]*$", "", peptide) # Some PIN may have charge state at the end of the peptide, e.g., R.RRVEHHDHAVVSGR4.L @@ -120,8 +119,6 @@ def remove_modifications(peptide: str, keep_modification=None) -> str: If keep_modification is provided, only those specific modifications will be preserved in the output sequence. """ - import re - if keep_modification is None: return re.sub(r"\[.*?\]", "", peptide) else: diff --git a/optimhc/visualization/plot_features.py b/optimhc/visualization/plot_features.py index 6a61412..bddb375 100644 --- a/optimhc/visualization/plot_features.py +++ b/optimhc/visualization/plot_features.py @@ -120,7 +120,7 @@ def plot_feature_importance( source_colors = dict(zip(rescoring_features.keys(), colors)) for source, features in rescoring_features.items(): - color = source_colors[source] # 修改:使用预分配的颜色 + color = source_colors[source] indices = [ i for i, name in enumerate(sum(rescoring_features.values(), [])) if name in features ] diff --git a/optimhc/visualization/save_or_show_plot.py b/optimhc/visualization/save_or_show_plot.py index 4264177..26d89db 100644 --- a/optimhc/visualization/save_or_show_plot.py +++ b/optimhc/visualization/save_or_show_plot.py @@ -12,3 +12,4 @@ def save_or_show_plot(save_path, logger, tight_layout=True): logger.info(f"Plot saved to {save_path}") else: plt.show() + plt.close("all") From 7f83279ad05a2803600ce409d27b5d38777b0c61 Mon Sep 17 00:00:00 2001 From: zshang Date: Sat, 7 Mar 2026 15:58:08 +0800 Subject: [PATCH 3/3] Fix typo in test configuration: changed 'numProcess' to 'numProcesses' --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 3ca8f50..54be89d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -130,6 +130,6 @@ def test_default_config_values(self): assert DEFAULT_CONFIG["inputType"] == "pepxml" assert DEFAULT_CONFIG["outputDir"] == "./results" assert DEFAULT_CONFIG["decoyPrefix"] == "DECOY_" - assert DEFAULT_CONFIG["numProcess"] == 4 + assert DEFAULT_CONFIG["numProcesses"] == 4 assert DEFAULT_CONFIG["rescore"]["testFDR"] == 0.01 assert DEFAULT_CONFIG["rescore"]["model"] == "Percolator"