diff --git a/README.md b/README.md
index ad7acd7..7808759 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# OptiMHC
-**An optimum rescoring pipeline for immunopeptidomics data that significantly enhances peptide identification performance.**
+**An optimized rescoring pipeline for immunopeptidomics data that significantly enhances peptide identification performance.**
OptiMHC integrates multiple rescoring features with machine learning-based rescoring to maximize the number of confidently identified peptides from mass spectrometry experiments.
@@ -240,12 +240,6 @@ Here are some examples:
-### GUI (Experimental)
-
-```bash
-optimhc gui
-```
-
### Full CLI Help
```bash
diff --git a/docs/api/core.md b/docs/api/core.md
index 744a49c..e5aa4c1 100644
--- a/docs/api/core.md
+++ b/docs/api/core.md
@@ -24,12 +24,6 @@
options:
members: true
-## Logging
-
-::: optimhc.core.logging_helper
- options:
- members: true
-
## Utilities
::: optimhc.utils
diff --git a/docs/api/features.md b/docs/api/features.md
index 99f852a..6b92874 100644
--- a/docs/api/features.md
+++ b/docs/api/features.md
@@ -2,54 +2,54 @@
## Base Class
-::: optimhc.feature_generator.base_feature_generator
+::: optimhc.feature.base_feature_generator
options:
members: true
## Basic
-::: optimhc.feature_generator.basic
+::: optimhc.feature.basic
options:
members: true
## Spectral Similarity
-::: optimhc.feature_generator.spectral_similarity
+::: optimhc.feature.spectral_similarity
options:
members: true
## DeepLC
-::: optimhc.feature_generator.DeepLC
+::: optimhc.feature.deeplc
options:
members: true
## Overlapping Peptide
-::: optimhc.feature_generator.overlapping_peptide
+::: optimhc.feature.overlapping_peptide
options:
members: true
## PWM
-::: optimhc.feature_generator.PWM
+::: optimhc.feature.pwm
options:
members: true
## MHCflurry
-::: optimhc.feature_generator.mhcflurry
+::: optimhc.feature.mhcflurry
options:
members: true
## NetMHCpan
-::: optimhc.feature_generator.netMHCpan
+::: optimhc.feature.netmhcpan
options:
members: true
## NetMHCIIpan
-::: optimhc.feature_generator.netMHCIIpan
+::: optimhc.feature.netmhciipan
options:
members: true
diff --git a/docs/development/index.md b/docs/development/index.md
index cfe6391..63e4f42 100644
--- a/docs/development/index.md
+++ b/docs/development/index.md
@@ -20,7 +20,7 @@ uv sync --locked --group dev
Alternatively, using pip:
```bash
-pip install -e ".[gui]"
+pip install -e .
pip install pytest ruff pre-commit
```
@@ -42,7 +42,7 @@ uv run ruff format . # Format
uv run ruff format --check . # Check without modifying
```
-Configuration: `line-length = 99`, rules `["E", "F", "I"]` (pycodestyle errors, pyflakes, isort). `E501` (line too long) is ignored. Ruff excludes `docs/`, `examples/`, and `optimhc/gui/`.
+Configuration: `line-length = 99`, rules `["E", "F", "I"]` (pycodestyle errors, pyflakes, isort). `E501` (line too long) is ignored. Ruff excludes `docs/` and `examples/`.
## Pre-commit Hooks
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index 50611e4..926650d 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -71,7 +71,4 @@ netMHCIIpan -v # Should print the version number
optimhc --help
```
-You should see the available commands: `pipeline`, `experiment`, and `gui`.
-
-!!! note "GUI"
- The Streamlit GUI is currently under development.
+You should see the available commands: `pipeline` and `experiment`.
diff --git a/docs/index.md b/docs/index.md
index 73ce52c..041865c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,6 +1,6 @@
# OptiMHC
-**An optimum rescoring pipeline for immunopeptidomics data that significantly enhances peptide identification performance.**
+**An optimized rescoring pipeline for immunopeptidomics data that significantly enhances peptide identification performance.**
OptiMHC integrates multiple rescoring features with machine learning-based rescoring to maximize the number of confidently identified peptides from mass spectrometry experiments.
diff --git a/optimhc/cli.py b/optimhc/cli.py
index 2f7c589..fae3e67 100644
--- a/optimhc/cli.py
+++ b/optimhc/cli.py
@@ -1,27 +1,55 @@
-import importlib.util
import json
import logging
-import os
-import sys
import click
+from optimhc import __version__
from optimhc.core import Pipeline
from optimhc.core.config import Config
-logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s %(levelname)s %(name)s: %(message)s",
- handlers=[logging.StreamHandler()],
-)
-
logger = logging.getLogger(__name__)
+LOG_MAPPING = {
+ "DEBUG": logging.DEBUG,
+ "INFO": logging.INFO,
+ "WARNING": logging.WARNING,
+ "ERROR": logging.ERROR,
+ "CRITICAL": logging.CRITICAL,
+}
+
+
+def setup_logging(level: str = "INFO") -> None:
+ if level not in LOG_MAPPING:
+ raise ValueError(f"Invalid log level: {level}")
+ logging.basicConfig(
+ level=LOG_MAPPING[level],
+ format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ force=True,
+ )
+
+ # mhctools attaches its own INFO-level handlers to its loggers
+ # https://github.com/openvax/mhctools/blob/master/mhctools/logging.conf
+ for name in [
+ "mhctools",
+ "mhctools.base_commandline_predictor",
+ "mhctools.netmhc",
+ "mhctools.netmhciipan",
+ "mhctools.process_helpers",
+ "mhctools.cleanup_context",
+ ]:
+ lg = logging.getLogger(name)
+ lg.handlers.clear()
+ lg.disabled = True
+ lg.propagate = False
+ lg.setLevel(logging.CRITICAL)
+
@click.group()
+@click.version_option(version=__version__, prog_name="optimhc")
def cli():
"""
- optiMHC - A high-performance rescoring pipeline for immunopeptidomics data.
+    OptiMHC - An optimized rescoring pipeline for immunopeptidomics data.
"""
pass
@@ -106,13 +134,8 @@ def pipeline(
model,
):
"""Run the optiMHC pipeline with the specified configuration."""
- # Load configuration
- if config:
- pipeline_config = Config(config)
- else:
- pipeline_config = Config()
+ pipeline_config = Config(config) if config else Config()
- # Override with command-line parameters
if inputtype:
pipeline_config["inputType"] = inputtype
if inputfile:
@@ -143,10 +166,9 @@ def pipeline(
if model:
pipeline_config["rescore"]["model"] = model
- # Run pipeline
+ setup_logging(pipeline_config["logLevel"])
pipeline_config.validate()
- pipeline = Pipeline(pipeline_config)
- pipeline.run()
+ Pipeline(pipeline_config).run()
@cli.command()
@@ -158,63 +180,10 @@ def pipeline(
)
def experiment(config):
"""Run multiple experiments with different feature combinations."""
- # Load configuration
pipeline_config = Config(config)
+ setup_logging(pipeline_config["logLevel"])
- # Run experiments
- pipeline = Pipeline(pipeline_config)
- pipeline.run_experiments()
-
-
-@cli.command()
-def gui():
- """Launch the optiMHC GUI."""
- if importlib.util.find_spec("streamlit") is None:
- print("Error: Streamlit is not installed. Install GUI dependencies with:")
- print("pip install optimhc[gui]")
- return
-
- import subprocess
-
- # Get the path to the GUI app
- gui_path = os.path.join(os.path.dirname(__file__), "gui", "app.py")
-
- if not os.path.exists(gui_path):
- print(f"Error: GUI application not found at {gui_path}")
- return
-
- # Create a temporary launcher script that uses the correct imports
- import tempfile
-
- launcher_content = """
-import os
-import sys
-import streamlit
-
-# Add the root directory to the path
-sys.path.insert(0, '{}')
-
-# Import the app module properly
-from optimhc.gui.app import main
-
-if __name__ == "__main__":
- main()
- """.format(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-
- fd, temp_path = tempfile.mkstemp(suffix=".py")
- with os.fdopen(fd, "w") as f:
- f.write(launcher_content)
-
- # Launch Streamlit with the temporary script
- print("Starting optiMHC GUI...")
- try:
- subprocess.run([sys.executable, "-m", "streamlit", "run", temp_path])
- finally:
- # Clean up the temporary file
- try:
- os.unlink(temp_path)
- except OSError:
- pass
+ Pipeline(pipeline_config).run_experiments()
if __name__ == "__main__":
diff --git a/optimhc/core/config.py b/optimhc/core/config.py
index a88fabf..89c6d18 100644
--- a/optimhc/core/config.py
+++ b/optimhc/core/config.py
@@ -59,11 +59,15 @@ def load_config(config_path):
Load and parse a configuration file using YAML.
Merges loaded config with default configuration.
- Parameters:
- config_path (str): Path to the YAML configuration file.
+ Parameters
+ ----------
+ config_path : str
+ Path to the YAML configuration file.
- Returns:
- dict: A dictionary containing all configurations.
+ Returns
+ -------
+ dict
+ A dictionary containing all configurations.
"""
logger.info(f"Loading configuration from {config_path}")
with open(config_path, "r") as f:
diff --git a/optimhc/core/feature_generation.py b/optimhc/core/feature_generation.py
index d83fff6..e9cf448 100644
--- a/optimhc/core/feature_generation.py
+++ b/optimhc/core/feature_generation.py
@@ -1,30 +1,12 @@
-"""
-feature_generation.py
-
-Implements feature generation logic for optiMHC, supporting multiple feature generators
-(Basic, OverlappingPeptide, PWM, MHCflurry, NetMHCpan, NetMHCIIpan, DeepLC, SpectralSimilarity, etc.).
-"""
-
import gc
import logging
-import os
-import re
-
-from optimhc.feature_generator.netMHCIIpan import NetMHCIIpanFeatureGenerator
-# The reason why we need to import the feature generators here is that
-# the package 'mhctools' affect the logging configuration of optiMHC.
-# TODO: find a better way to handle this.
-from optimhc.feature_generator.netMHCpan import NetMHCpanFeatureGenerator
+import optimhc.feature # noqa: F401 -- triggers generator registration
+from optimhc.feature.factory import feature_generator_factory
logger = logging.getLogger(__name__)
-# TODO: refactor the code to pass config as a parameter to the generators
-# TODO: factory method for feature generators
-# TODO: for allele-specific generators, we need to test the validation of the allele input first
-
-
def generate_features(psms, config):
"""
Generate features from different generators according to the configuration.
@@ -35,321 +17,21 @@ def generate_features(psms, config):
A container object holding PSMs and relevant data.
config : dict
Configuration dictionary loaded from YAML or CLI.
-
- Returns
- -------
- None
- Features are added in-place to the PsmContainer.
-
- Examples
- --------
- >>> generate_features(psms, config)
"""
- remove_modification = True
- remove_pre_nxt_aa = config["removePreNxtAA"]
- n_processes = config["numProcesses"]
- show_progress = config["showProgress"]
- mod_dict = config.get("modificationMap", None)
- if mod_dict == {}:
- mod_dict = None
feature_generators = config.get("featureGenerator", None)
- allele = config.get("allele", None)
- unique_peptides = list(set(psms.peptides))
-
- if feature_generators is not None:
- for generator_config in feature_generators:
- if not isinstance(generator_config, dict):
- logger.warning("Feature generator config is not a dictionary, skipping...")
- continue
-
- generator_type = generator_config.get("name")
- logger.info(f"Generating features with {generator_type}...")
- generator_params = generator_config.get("params", {})
-
- if generator_type == "OverlappingPeptide":
- from optimhc.feature_generator.overlapping_peptide import (
- OverlappingPeptideFeatureGenerator,
- assign_brother_aggregated_feature,
- )
-
- overlapping_peptide = OverlappingPeptideFeatureGenerator(
- unique_peptides,
- min_overlap_length=generator_params.get("minOverlapLength", 8),
- min_length=generator_params.get("minLength", 8),
- max_length=generator_params.get("maxLength", 25),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- remove_modification=remove_modification,
- )
- overlapping_features = overlapping_peptide.generate_features()
- full_data = overlapping_peptide.get_full_data()
-
- psms.add_metadata(
- full_data[["Peptide", "contig_member_count", "ContigSequence"]],
- psms_key=psms.peptide_column,
- metadata_key="Peptide",
- source="OverlappingPeptide",
- )
- psms.add_features(
- overlapping_features,
- psms_key=psms.peptide_column,
- feature_key=overlapping_peptide.id_column,
- source="OverlappingPeptide",
- )
- score = generator_params.get("overlappingScore", None)
- if score:
- assign_brother_aggregated_feature(
- psms,
- feature_columns=score,
- overlapping_source="OverlappingPeptide",
- source_name="ContigFeatures",
- )
-
- del overlapping_peptide, overlapping_features, full_data
- gc.collect()
-
- elif generator_type == "Basic":
- from optimhc.feature_generator.basic import BasicFeatureGenerator
-
- basic_generator = BasicFeatureGenerator(
- psms.psms[psms.peptide_column].tolist(),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- remove_modification=remove_modification,
- )
- basic_features = basic_generator.generate_features()
- psms.add_features_by_index(
- basic_features[basic_generator.feature_columns], source="Basic"
- )
-
- del basic_generator, basic_features
- gc.collect()
-
- elif generator_type == "PWM":
- from optimhc.feature_generator.PWM import PWMFeatureGenerator
-
- pwm_generator = PWMFeatureGenerator(
- unique_peptides,
- alleles=allele,
- mhc_class=generator_params.get("class", "I"),
- remove_modification=remove_modification,
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- )
- pwm_features = pwm_generator.generate_features()
- psms.add_features(
- pwm_features,
- psms_key=psms.peptide_column,
- feature_key=pwm_generator.id_column,
- source="PWM",
- )
-
- del pwm_generator, pwm_features
- gc.collect()
-
- elif generator_type == "MHCflurry":
- from optimhc.feature_generator.mhcflurry import (
- MHCflurryFeatureGenerator,
- )
-
- mhcflurry_generator = MHCflurryFeatureGenerator(
- unique_peptides,
- alleles=allele,
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- remove_modification=remove_modification,
- )
- mhcflurry_features = mhcflurry_generator.generate_features()
- psms.add_features(
- mhcflurry_features,
- psms_key=psms.peptide_column,
- feature_key=mhcflurry_generator.id_column,
- source="MHCflurry",
- )
-
- del mhcflurry_generator, mhcflurry_features
- gc.collect()
-
- elif generator_type == "NetMHCpan":
- # from optimhc.feature_generator.netMHCpan import NetMHCpanFeatureGenerator
- netmhcpan_generator = NetMHCpanFeatureGenerator(
- unique_peptides,
- alleles=allele,
- mode=generator_params.get("mode", "best"),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- remove_modification=remove_modification,
- n_processes=n_processes,
- show_progress=show_progress,
- )
- netmhcpan_features = netmhcpan_generator.generate_features()
- psms.add_features(
- netmhcpan_features,
- psms_key=psms.peptide_column,
- feature_key=netmhcpan_generator.id_column,
- source="NetMHCpan",
- )
-
- del netmhcpan_generator, netmhcpan_features
- gc.collect()
-
- elif generator_type == "NetMHCIIpan":
- # from optimhc.feature_generator.netMHCIIpan import NetMHCIIpanFeatureGenerator
- netmhciipan_generator = NetMHCIIpanFeatureGenerator(
- unique_peptides,
- alleles=allele,
- mode=generator_params.get("mode", "best"),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- remove_modification=remove_modification,
- n_processes=n_processes,
- show_progress=show_progress,
- )
- netmhciipan_features = netmhciipan_generator.generate_features()
- psms.add_features(
- netmhciipan_features,
- psms_key=psms.peptide_column,
- feature_key=netmhciipan_generator.id_column,
- source="NetMHCIIpan",
- )
-
- del netmhciipan_generator, netmhciipan_features
- gc.collect()
-
- elif generator_type == "DeepLC":
- from optimhc.feature_generator.DeepLC import DeepLCFeatureGenerator
-
- deeplc_generator = DeepLCFeatureGenerator(
- psms,
- calibration_criteria_column=generator_params.get("calibrationCriteria"),
- lower_score_is_better=generator_params.get("lowerIsBetter"),
- calibration_set_size=generator_params.get("calibrationSize", 0.1),
- processes=n_processes,
- # TODO: Check here carefully
- # Since DeepLC is GPU-based,
- # the processes here is not the same meaning as the n_processes in multi-threading
- model_path=generator_params.get("model_path", None),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- mod_dict=mod_dict,
- )
- deeplc_features = deeplc_generator.generate_features()
- psms.add_features_by_index(
- deeplc_features[deeplc_generator.feature_columns], source="DeepLC"
- )
-
- del deeplc_generator, deeplc_features
- gc.collect()
-
- elif generator_type == "SpectralSimilarity":
- from optimhc.feature_generator.spectral_similarity import (
- SpectralSimilarityFeatureGenerator,
- )
-
- # Match PSMs with the spectra
- mzML_dir = generator_params.get("mzmlDir", None)
- if mzML_dir is None:
- logger.error(
- "mzML_dir is not provided for SpectralSimilarity feature generator."
- )
- continue
-
- pattern = generator_params.get("spectrumIdPattern", None)
- mz_file_names = []
- spectrum_ids = psms.spectrum_ids
-
- if pattern:
- logger.info(
- f"Using pattern: {pattern} to extract mzML file names from spectrum IDs."
- )
- for spectrum_id in spectrum_ids:
- mz_file_names.append(re.match(pattern, spectrum_id).group(1))
- logger.info(f"mzML file names: {list(set(mz_file_names))}")
- else:
- logger.info("Spectrum ID pattern is not provided.")
- if psms.ms_data_file_column is not None:
- logger.info(
- f"Trying to extract mzML file names from {psms.ms_data_file_column}"
- )
- logger.info(
- f"MS data file format: {set(psms.psms[psms.ms_data_file_column])}"
- )
-
- for ms_data_file in psms.psms[psms.ms_data_file_column]:
- mz_file_basename = os.path.basename(ms_data_file).split(".")[0]
- if mz_file_basename.endswith(".mzML"):
- mz_file_basename = mz_file_basename[:-5]
- elif mz_file_basename.endswith("mzML"):
- mz_file_basename = mz_file_basename[:-4]
- mz_file_names.append(mz_file_basename)
-
- logger.info(f"mzML file names: {list(set(mz_file_names))}")
- else:
- logger.info("MS data file information is not provided.")
- logger.info(
- r"Trying to use the default pattern: (.+?)\.\d+\.\d+\.\d+ to extract mzML file names from spectrum IDs."
- )
- for spectrum_id in spectrum_ids:
- mz_file_names.append(
- re.match(r"(.+?)\.\d+\.\d+\.\d+", spectrum_id).group(1)
- )
-
- mz_file_paths = [
- os.path.join(mzML_dir, f"{mz_file}.mzML") for mz_file in mz_file_names
- ]
- mz_file_paths_set = set(mz_file_paths)
- logger.info(f"mz_file_paths: {mz_file_paths_set}")
-
- for mz_file_path in mz_file_paths_set:
- if not os.path.exists(mz_file_path):
- logger.error(f"mzML file not found: {mz_file_path}")
- continue
-
- model_type = generator_params.get("model", None)
- if model_type is None:
- logger.error(
- "Model type is not provided for SpectralSimilarity feature generator."
- )
- raise ValueError(
- "Model type is required for SpectralSimilarity feature generator."
- )
-
- collision_energy = generator_params.get("collisionEnergy", None)
- instrument = generator_params.get("instrument", None)
- fragmentation_type = generator_params.get("fragmentationType", None)
- spectral_similarity_generator = SpectralSimilarityFeatureGenerator(
- spectrum_ids=psms.spectrum_ids,
- peptides=psms.peptides,
- charges=psms.charges,
- scan_ids=psms.scan_ids,
- mz_file_paths=mz_file_paths,
- model_type=generator_params.get("model"),
- collision_energies=(
- [collision_energy] * len(psms.peptides) if collision_energy else None
- ),
- instruments=([instrument] * len(psms.peptides) if instrument else None),
- fragmentation_types=(
- [fragmentation_type] * len(psms.peptides) if fragmentation_type else None
- ),
- remove_pre_nxt_aa=remove_pre_nxt_aa,
- mod_dict=mod_dict,
- url=generator_params.get("url"),
- ssl=generator_params.get("ssl", True),
- top_n=generator_params.get("numTopPeaks", 36),
- tolerance_ppm=generator_params.get("tolerance", 20),
- )
-
- spectral_similarity_features = spectral_similarity_generator.generate_features()
- psms.add_features(
- spectral_similarity_features,
- psms_key=[
- psms.spectrum_column,
- psms.peptide_column,
- psms.charge_column,
- ],
- feature_key=spectral_similarity_generator.id_column,
- source="SpectralSimilarity",
- )
- del (
- spectral_similarity_generator,
- spectral_similarity_features,
- mz_file_paths,
- mz_file_names,
- )
- gc.collect()
-
- else:
- logger.warning(f"Unknown feature generator: {generator_type}, skipping...")
+ if not feature_generators:
+ return
+
+ for generator_config in feature_generators:
+ if not isinstance(generator_config, dict):
+ logger.warning("Feature generator config is not a dictionary, skipping...")
+ continue
+
+ name = generator_config.get("name")
+ params = generator_config.get("params", {})
+
+ logger.info(f"Generating features with {name}...")
+ generator_cls = feature_generator_factory.get_generator(name)
+ generator = generator_cls.from_config(psms, config, params)
+ generator.apply(psms, source=name)
+ gc.collect()
diff --git a/optimhc/core/logging_helper.py b/optimhc/core/logging_helper.py
deleted file mode 100644
index 100cfc3..0000000
--- a/optimhc/core/logging_helper.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import logging
-
-
-def setup_loggers(log_file=None, log_level="INFO"):
- """
- Create or update all loggers so that each logger has a StreamHandler and optionally a FileHandler.
- This ensures all log messages are displayed in the console and optionally saved to a file.
-
- Parameters
- ----------
- log_file : str, optional
- Path to the log file. If None, no file logging is set up.
- log_level : str, optional
- Logging level (DEBUG, INFO, WARNING, ERROR). Default is "INFO".
- """
- # Disable mhctools logging, avoid the warning message when multiprocessing
- for logger_name in [
- "mhctools",
- "mhctools.base_commandline_predictor",
- "mhctools.netmhc",
- "mhctools.netmhciipan",
- ]:
- logger = logging.getLogger(logger_name)
- logger.disabled = True
- logger.propagate = False
- logger.setLevel(logging.CRITICAL)
-
- loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
- level = getattr(logging, log_level.upper(), logging.INFO)
-
- # debug_logging()
-
- for lg in loggers:
- if lg.name.startswith("mhctools"):
- continue
-
- lg.disabled = False
- has_stream_handler = any(
- isinstance(handler, logging.StreamHandler) for handler in lg.handlers
- )
- if not has_stream_handler:
- console_handler = logging.StreamHandler()
- console_handler.setLevel(level)
- formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
- console_handler.setFormatter(formatter)
- lg.addHandler(console_handler)
-
- if log_file:
- has_file_handler = any(
- isinstance(handler, logging.FileHandler) for handler in lg.handlers
- )
- if not has_file_handler:
- file_handler = logging.FileHandler(log_file, mode="a")
- file_handler.setLevel(level)
- formatter = logging.Formatter(
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- )
- file_handler.setFormatter(formatter)
- lg.addHandler(file_handler)
-
- lg.propagate = False
- lg.setLevel(level)
-
- if lg.name.startswith("optimhc"):
- lg.disabled = False
-
- root_logger = logging.getLogger()
- root_logger.disabled = False
- root_logger.setLevel(level)
-
-
-def debug_logging():
- """
- Print debugging information for all loggers that start with 'optimhc' and
- the root logger. This helps verify that logger configurations are set properly.
- """
- print("\n=== Debugging Loggers ===\n")
- loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict.keys()]
- for lg in loggers:
- if lg.name.startswith("optimhc"):
- print(f"Logger Name: {lg.name}")
- print(f" - Effective Level: {logging.getLevelName(lg.getEffectiveLevel())}")
- print(f" - Explicit Level: {logging.getLevelName(lg.level)} (default: NOTSET)")
- print(f" - Propagate: {lg.propagate}")
- print(f" - Disabled: {lg.disabled}")
-
- if lg.handlers:
- for handler in lg.handlers:
- print(f" Handler: {type(handler).__name__}")
- print(f" - Level: {logging.getLevelName(handler.level)}")
- print(f" - Formatter: {handler.formatter}")
- if isinstance(handler, logging.FileHandler):
- print(f" - Log File: {handler.baseFilename}")
- print(f" - Stream: {getattr(handler, 'stream', None)}")
- else:
- print(" No handlers attached to the logger.")
- print("")
-
- root_logger = logging.getLogger()
- print("Root Logger:")
- print(f" - Level: {logging.getLevelName(root_logger.level)}")
- print(f" - Handlers: {len(root_logger.handlers)}")
- for handler in root_logger.handlers:
- print(f" Handler: {type(handler).__name__}")
- print(f" - Level: {logging.getLevelName(handler.level)}")
- print(f" - Formatter: {handler.formatter}")
- if isinstance(handler, logging.FileHandler):
- print(f" - Log File: {handler.baseFilename}")
- print(f" - Stream: {getattr(handler, 'stream', None)}")
- print("\n=== End of Logger Debugging ===\n")
diff --git a/optimhc/core/pipeline.py b/optimhc/core/pipeline.py
index 32b5653..ab4fad8 100644
--- a/optimhc/core/pipeline.py
+++ b/optimhc/core/pipeline.py
@@ -15,7 +15,6 @@
from optimhc.core.config import Config
from optimhc.core.feature_generation import generate_features
-from optimhc.core.logging_helper import setup_loggers
from optimhc.parser import read_pepxml, read_pin
from optimhc.rescore import mokapot
from optimhc.rescore.model import RandomForestPercolatorModel, XGBoostPercolatorModel
@@ -67,7 +66,6 @@ def __init__(self, config):
self.experiment = self.config.get("experimentName", "optimhc_experiment")
self.output_dir = os.path.join(self.config["outputDir"], self.experiment)
os.makedirs(self.output_dir, exist_ok=True)
- setup_loggers(os.path.join(self.output_dir, "log"), self.config.get("logLevel", "INFO"))
self.visualization_enabled = self.config.get("visualization", True)
self.save_models = self.config.get("saveModels", True)
diff --git a/optimhc/feature/__init__.py b/optimhc/feature/__init__.py
new file mode 100644
index 0000000..c69db0c
--- /dev/null
+++ b/optimhc/feature/__init__.py
@@ -0,0 +1,27 @@
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.basic import BasicFeatureGenerator
+from optimhc.feature.deeplc import DeepLCFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
+from optimhc.feature.mhcflurry import MHCflurryFeatureGenerator
+from optimhc.feature.netmhciipan import NetMHCIIpanFeatureGenerator
+from optimhc.feature.netmhcpan import NetMHCpanFeatureGenerator
+from optimhc.feature.overlapping_peptide import (
+ OverlappingPeptideFeatureGenerator,
+)
+from optimhc.feature.pwm import PWMFeatureGenerator
+from optimhc.feature.spectral_similarity import (
+ SpectralSimilarityFeatureGenerator,
+)
+
+__all__ = [
+ "feature_generator_factory",
+ "BaseFeatureGenerator",
+ "BasicFeatureGenerator",
+ "PWMFeatureGenerator",
+ "OverlappingPeptideFeatureGenerator",
+ "MHCflurryFeatureGenerator",
+ "NetMHCpanFeatureGenerator",
+ "NetMHCIIpanFeatureGenerator",
+ "DeepLCFeatureGenerator",
+ "SpectralSimilarityFeatureGenerator",
+]
diff --git a/optimhc/feature/base_feature_generator.py b/optimhc/feature/base_feature_generator.py
new file mode 100644
index 0000000..8d9424f
--- /dev/null
+++ b/optimhc/feature/base_feature_generator.py
@@ -0,0 +1,80 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+import pandas as pd
+
+from optimhc.psm_container import PsmContainer
+
+
+class BaseFeatureGenerator(ABC):
+ """Abstract base class for all feature generators in the rescoring pipeline.
+
+ Subclasses must implement:
+ - ``feature_columns`` -- names of generated feature columns
+ - ``id_column`` -- merge key column(s)
+ - ``generate_features()`` -- pure computation, returns a DataFrame
+ - ``from_config()`` -- construct an instance from pipeline config
+
+ The default ``apply()`` merges features by peptide column.
+ Override it for index-based merges, composite keys, or post-processing.
+ """
+
+ @property
+ @abstractmethod
+ def feature_columns(self) -> List[str]:
+ """Return a list of feature column names produced by this generator."""
+ ...
+
+ @property
+ @abstractmethod
+ def id_column(self) -> List[str]:
+ """Return the column(s) used as merge key(s) with the PsmContainer."""
+ ...
+
+ @abstractmethod
+ def generate_features(self) -> pd.DataFrame:
+ """Generate features and return them as a DataFrame."""
+ ...
+
+ @classmethod
+ def from_config(
+ cls,
+ psms: PsmContainer,
+ config: dict,
+ params: dict,
+ ) -> "BaseFeatureGenerator":
+ """Construct a generator instance from pipeline configuration.
+
+ Parameters
+ ----------
+ psms : PsmContainer
+ The PSM container with all current data.
+ config : dict
+ The full pipeline configuration.
+ params : dict
+ Generator-specific parameters from
+ ``config["featureGenerator"][i]["params"]``.
+ """
+ raise NotImplementedError(f"{cls.__name__} must implement from_config()")
+
+ def apply(self, psms: PsmContainer, source: str) -> None:
+ """Generate features and merge them into the PsmContainer.
+
+ The default implementation merges by peptide column using
+ ``add_features()``. Override for different merge strategies
+ (index-based, composite key) or additional post-processing.
+
+ Parameters
+ ----------
+ psms : PsmContainer
+ The PSM container to add features to (modified in-place).
+ source : str
+ Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
+ """
+ features = self.generate_features()
+ psms.add_features(
+ features,
+ psms_key=psms.peptide_column,
+ feature_key=self.id_column,
+ source=source,
+ )
diff --git a/optimhc/feature_generator/basic.py b/optimhc/feature/basic.py
similarity index 85%
rename from optimhc/feature_generator/basic.py
rename to optimhc/feature/basic.py
index 2d29472..2e0fde3 100644
--- a/optimhc/feature_generator/basic.py
+++ b/optimhc/feature/basic.py
@@ -1,5 +1,3 @@
-# feature_generator/basic.py
-
import logging
from typing import List
@@ -7,7 +5,8 @@
from scipy.stats import entropy # Import entropy from scipy
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
logger = logging.getLogger(__name__)
@@ -85,11 +84,15 @@ def _preprocess_peptide(self, peptide: str) -> str:
"""
Preprocess peptide sequence by removing adjacent amino acids and modifications.
- Parameters:
- peptide (str): Original peptide sequence.
+ Parameters
+ ----------
+ peptide : str
+ Original peptide sequence.
- Returns:
- str: Preprocessed peptide sequence.
+ Returns
+ -------
+ str
+ Preprocessed peptide sequence.
"""
if self.remove_pre_nxt_aa:
peptide = utils.strip_flanking_and_charge(peptide)
@@ -101,11 +104,15 @@ def _shannon_entropy(self, sequence: str) -> float:
"""
Calculate the Shannon entropy of a peptide sequence.
- Parameters:
- sequence (str): Peptide sequence.
+ Parameters
+ ----------
+ sequence : str
+ Peptide sequence.
- Returns:
- float: Shannon entropy value.
+ Returns
+ -------
+ float
+ Shannon entropy value.
"""
if len(sequence) == 0:
return 0.0
@@ -162,3 +169,18 @@ def generate_features(self) -> pd.DataFrame:
logger.info(f"Generated basic features for {len(features_df)} peptides.")
return features_df
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ return cls(
+ peptides=psms.peptides,
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ remove_modification=True,
+ )
+
+ def apply(self, psms, source):
+ features = self.generate_features()
+ psms.add_features_by_index(features[self.feature_columns], source=source)
+
+
+feature_generator_factory.register_generator("Basic", BasicFeatureGenerator)
diff --git a/optimhc/feature_generator/DeepLC.py b/optimhc/feature/deeplc.py
similarity index 92%
rename from optimhc/feature_generator/DeepLC.py
rename to optimhc/feature/deeplc.py
index 0060e8a..348f791 100644
--- a/optimhc/feature_generator/DeepLC.py
+++ b/optimhc/feature/deeplc.py
@@ -1,4 +1,3 @@
-# feature_generator/DeepLC.py
# TODO: Use koina for prediction
import logging
@@ -9,7 +8,8 @@
from deeplc import DeepLC
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
from optimhc.psm_container import PsmContainer
logger = logging.getLogger(__name__)
@@ -78,28 +78,30 @@ def __init__(
DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to
disable retraining.
- Parameters:
- psms: PsmContainer
+ Parameters
+ ----------
+ psms : PsmContainer
PSMs to generate features for.
- calibration_criteria_column: str
+ calibration_criteria_column : str
Column name in the PSMs DataFrame to use for DeepLC calibration.
- lower_score_is_better
- Whether a lower PSM score denotes a better matching PSM. Default: False
- calibration_set_size: int or float
+ lower_score_is_better : bool
+ Whether a lower PSM score denotes a better matching PSM. Default: False.
+ calibration_set_size : int or float
Amount of best PSMs to use for DeepLC calibration. If this value is lower
than the number of available PSMs, all PSMs will be used. (default: 0.15)
- processes: {int, None}
+ processes : int or None
Number of processes to use in DeepLC. Defaults to 1.
- model_path: str
+ model_path : str
Path to the DeepLC model. If None, the default model will be used.
- remove_pre_nxt_aa: bool
+ remove_pre_nxt_aa : bool
Whether to remove the first and last amino acids from the peptide sequence.
- Default: True
- mod_dict: dict
- Dictionary of modifications to be used for DeepLC. If None, no modifications will be used.
- *args: list
+ Default: True.
+ mod_dict : dict
+ Dictionary of modifications to be used for DeepLC. If None, no modifications
+ will be used.
+ *args : list
Additional positional arguments are passed to DeepLC.
- kwargs: dict
+ **kwargs : dict
Additional keyword arguments are passed to DeepLC.
"""
self.psms = psms
@@ -456,3 +458,26 @@ def save_raw_predictions(self, file_path: str, **kwargs) -> None:
logger.info(f"Raw predictions saved to {file_path}")
else:
logger.warning("Raw predictions have not been generated yet.")
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ mod_dict = config.get("modificationMap", None)
+ if mod_dict == {}:
+ mod_dict = None
+ return cls(
+ psms=psms,
+ calibration_criteria_column=params.get("calibrationCriteria"),
+ lower_score_is_better=params.get("lowerIsBetter"),
+ calibration_set_size=params.get("calibrationSize", 0.1),
+ processes=config.get("numProcesses", 1),
+ model_path=params.get("model_path", None),
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ mod_dict=mod_dict,
+ )
+
+ def apply(self, psms, source):
+ features = self.generate_features()
+ psms.add_features_by_index(features[self.feature_columns], source=source)
+
+
+feature_generator_factory.register_generator("DeepLC", DeepLCFeatureGenerator)
diff --git a/optimhc/feature/factory.py b/optimhc/feature/factory.py
new file mode 100644
index 0000000..561c0c8
--- /dev/null
+++ b/optimhc/feature/factory.py
@@ -0,0 +1,37 @@
+import logging
+from typing import Dict, List, Type
+
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+
+logger = logging.getLogger(__name__)
+
+
+class FeatureGeneratorFactory:
+ """Registry and factory for feature generators.
+
+ Each generator module registers itself at import time by calling
+ ``feature_generator_factory.register_generator(name, cls)``.
+ The orchestrator retrieves generators by name via ``get_generator(name)``.
+ """
+
+ def __init__(self):
+ self._registry: Dict[str, Type[BaseFeatureGenerator]] = {}
+
+ def register_generator(self, name: str, generator_class: Type[BaseFeatureGenerator]):
+ """Register a feature generator class under *name*."""
+ self._registry[name] = generator_class
+
+ def get_generator(self, name: str) -> Type[BaseFeatureGenerator]:
+ """Return the generator class registered under *name*."""
+ if name not in self._registry:
+ raise ValueError(
+ f"Unknown feature generator: '{name}'. Available: {sorted(self._registry.keys())}"
+ )
+ return self._registry[name]
+
+ def list_generators(self) -> List[str]:
+ """Return sorted list of registered generator names."""
+ return sorted(self._registry.keys())
+
+
+feature_generator_factory = FeatureGeneratorFactory()
diff --git a/optimhc/feature_generator/mhcflurry.py b/optimhc/feature/mhcflurry.py
similarity index 95%
rename from optimhc/feature_generator/mhcflurry.py
rename to optimhc/feature/mhcflurry.py
index 1cc3b30..d151f06 100644
--- a/optimhc/feature_generator/mhcflurry.py
+++ b/optimhc/feature/mhcflurry.py
@@ -5,7 +5,8 @@
from mhcflurry import Class1PresentationPredictor
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
logger = logging.getLogger(__name__)
@@ -338,3 +339,15 @@ def predictions_to_dataframe(self) -> pd.DataFrame:
if self.predictions is None:
raise ValueError("No predictions available. Please run 'generate_features' first.")
return self.predictions
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ return cls(
+ peptides=list(set(psms.peptides)),
+ alleles=config.get("allele", []),
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ remove_modification=True,
+ )
+
+
+feature_generator_factory.register_generator("MHCflurry", MHCflurryFeatureGenerator)
diff --git a/optimhc/feature_generator/netMHCIIpan.py b/optimhc/feature/netmhciipan.py
similarity index 97%
rename from optimhc/feature_generator/netMHCIIpan.py
rename to optimhc/feature/netmhciipan.py
index b65db9d..336fc5e 100644
--- a/optimhc/feature_generator/netMHCIIpan.py
+++ b/optimhc/feature/netmhciipan.py
@@ -1,5 +1,3 @@
-# feature_generator/netMHCIIpan.py
-
# TODO: set 'BA' and 'EL' as optional parameters for the user to choose the prediction method.
import logging
@@ -12,7 +10,8 @@
from tqdm import tqdm
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
logger = logging.getLogger(__name__)
@@ -647,3 +646,18 @@ def save_raw_predictions(self, file_path: str, **kwargs) -> None:
logger.info(f"Raw prediction results saved to: {file_path}")
else:
logger.warning("No raw prediction results available to save.")
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ return cls(
+ peptides=list(set(psms.peptides)),
+ alleles=config.get("allele", []),
+ mode=params.get("mode", "best"),
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ remove_modification=True,
+ n_processes=config.get("numProcesses", 1),
+ show_progress=config.get("showProgress", False),
+ )
+
+
+feature_generator_factory.register_generator("NetMHCIIpan", NetMHCIIpanFeatureGenerator)
diff --git a/optimhc/feature_generator/netMHCpan.py b/optimhc/feature/netmhcpan.py
similarity index 97%
rename from optimhc/feature_generator/netMHCpan.py
rename to optimhc/feature/netmhcpan.py
index 089bd6d..7816da6 100644
--- a/optimhc/feature_generator/netMHCpan.py
+++ b/optimhc/feature/netmhcpan.py
@@ -1,5 +1,3 @@
-# feature_generators/netmhcpan_feature_generator.py
-
# TODO: Except 'best' mode, the other modes seems to be not working properly. We need to investigate this issue.
import logging
@@ -12,6 +10,7 @@
from tqdm import tqdm
from optimhc import utils
+from optimhc.feature.factory import feature_generator_factory
from .base_feature_generator import BaseFeatureGenerator
@@ -674,3 +673,18 @@ def predictions_to_dataframe(self) -> pd.DataFrame:
# logger.info(f"Generated best allele information for {len(best_allele_df)} peptides.")
# return best_allele_df
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ return cls(
+ peptides=list(set(psms.peptides)),
+ alleles=config.get("allele", []),
+ mode=params.get("mode", "best"),
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ remove_modification=True,
+ n_processes=config.get("numProcesses", 1),
+ show_progress=config.get("showProgress", False),
+ )
+
+
+feature_generator_factory.register_generator("NetMHCpan", NetMHCpanFeatureGenerator)
diff --git a/optimhc/feature_generator/numba_utils.py b/optimhc/feature/numba_utils.py
similarity index 100%
rename from optimhc/feature_generator/numba_utils.py
rename to optimhc/feature/numba_utils.py
diff --git a/optimhc/feature_generator/overlapping_peptide.py b/optimhc/feature/overlapping_peptide.py
similarity index 89%
rename from optimhc/feature_generator/overlapping_peptide.py
rename to optimhc/feature/overlapping_peptide.py
index 42d4f04..d50faef 100644
--- a/optimhc/feature_generator/overlapping_peptide.py
+++ b/optimhc/feature/overlapping_peptide.py
@@ -1,5 +1,3 @@
-# feature_generator/overlapping_peptide.py
-
import logging
from collections import defaultdict
from typing import Dict, List, Tuple, Union
@@ -11,7 +9,8 @@
from tqdm import tqdm
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
from optimhc.psm_container import PsmContainer
logger = logging.getLogger(__name__)
@@ -688,111 +687,43 @@ def get_full_data(self) -> pd.DataFrame:
self.full_data = self.overlap_data.merge(full_data_df, on="clean_peptide", how="left")
return self.full_data
+ @classmethod
+ def from_config(cls, psms, config, params):
+ instance = cls(
+ peptides=list(set(psms.peptides)),
+ min_overlap_length=params.get("minOverlapLength", 8),
+ min_length=params.get("minLength", 8),
+ max_length=params.get("maxLength", 25),
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ remove_modification=True,
+ )
+ instance._overlapping_score = params.get("overlappingScore", None)
+ return instance
+
+ def apply(self, psms, source):
+ features = self.generate_features()
+ full_data = self.get_full_data()
+
+ psms.add_metadata(
+ full_data[["Peptide", "contig_member_count", "ContigSequence"]],
+ psms_key=psms.peptide_column,
+ metadata_key="Peptide",
+ source=source,
+ )
+ psms.add_features(
+ features,
+ psms_key=psms.peptide_column,
+ feature_key=self.id_column,
+ source=source,
+ )
-'''
-# TODO: test
-
-def assign_brother_aggregated_feature(
- psms: PsmContainer,
- feature_columns: Union[str, List[str]],
- overlapping_source: str,
- source_name: str = 'OverlappingGroupFeatures'
-) -> None:
- """
- Assign aggregated features based on brother peptides to the PSMs.
-
- For PSMs with the same ContigSequence (brother peptides), compute the mean of specified features
- and assign these aggregated features back to each PSM in the group.
- If a PSM does not have a ContigSequence (no brothers), its new features will be set to the original values.
-
- Metadata in the PSM container:
- {
- "source_name": {
- "metadata_field_1": "value1",
- "metadata_field_2": "value2"
- }
- }
-
- Parameters:
- psms (PsmContainer): PSM container containing the peptides and features.
- feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate.
- overlapping_source (str): Source name of the overlapping peptide features.
- source_name (str): Name of the new feature source.
-
- Returns:
- None
- """
- if isinstance(feature_columns, str):
- feature_columns = [feature_columns]
- psms_df = psms.psms
-
- if psms.metadata_column is None:
- raise ValueError("The PSMs do not contain metadata.")
- metadata = psms_df[psms.metadata_column]
- print(metadata)
-
-
- def get_overlapping_data(x):
- try:
- return x.get(overlapping_source, {})
- except AttributeError:
- logger.error(f"Metadata for PSM {x} is not a dictionary.")
- return {}
-
- def get_contig_sequence(x):
- try:
- return x.get('ContigSequence', None)
- except AttributeError:
- logger.error(f"Invalid metadata for PSM {x}.")
- return None
-
- overlapping_data = metadata.apply(get_overlapping_data)
- contig_sequences = overlapping_data.apply(get_contig_sequence)
- print(overlapping_data)
- print(contig_sequences)
-
- psms_df['ContigSequence'] = contig_sequences
-
- for feature in feature_columns:
- if feature not in psms_df.columns:
- raise ValueError(f"Feature column '{feature}' not found in PSMs.")
-
- grouped_mean = psms_df.groupby('ContigSequence')[feature_columns].mean().reset_index()
- #grouped_sum = psms_df.groupby('ContigSequence')[feature_columns].sum().reset_index()
-
- """
- grouped = grouped_mean.merge(grouped_sum,
- on='ContigSequence',
- suffixes=('_brother_mean', '_brother_sum'))
- """
- psms_with_agg = psms_df.merge(grouped_mean,
- on='ContigSequence',
- how='left',
- suffixes=('', '_brother_mean'))
-
-
- # use the original feature values if the aggregated values are missing
- for feature in feature_columns:
- mean_feature = feature + '_brother_mean'
- sum_feature = feature + '_brother_sum'
- psms_with_agg[mean_feature].fillna(psms_with_agg[feature], inplace=True)
- psms_with_agg[sum_feature].fillna(psms_with_agg[feature], inplace=True)
-
-
- agg_feature_columns = []
- for feature in feature_columns:
- mean_feature = feature + '_brother_mean'
- sum_feature = feature + '_brother_sum'
- agg_feature_columns.append(mean_feature)
- agg_feature_columns.append(sum_feature)
-
- new_features_df = psms_with_agg[agg_feature_columns]
- new_features_df.columns = agg_feature_columns
-
- psms.add_features_by_index(new_features_df, source=source_name)
-
-
-'''
+ if self._overlapping_score:
+ assign_brother_aggregated_feature(
+ psms,
+ feature_columns=self._overlapping_score,
+ overlapping_source=source,
+ source_name="ContigFeatures",
+ )
def assign_brother_aggregated_feature(
@@ -880,3 +811,8 @@ def get_contig_sequence(x):
new_features_df = psms_with_agg[agg_feature_columns]
psms.add_features_by_index(features_df=new_features_df, source=source_name)
+
+
+feature_generator_factory.register_generator(
+ "OverlappingPeptide", OverlappingPeptideFeatureGenerator
+)
diff --git a/optimhc/feature_generator/PWM.py b/optimhc/feature/pwm.py
similarity index 98%
rename from optimhc/feature_generator/PWM.py
rename to optimhc/feature/pwm.py
index 84b541d..913298c 100644
--- a/optimhc/feature_generator/PWM.py
+++ b/optimhc/feature/pwm.py
@@ -1,5 +1,3 @@
-# feature_generator/PWM.py
-
import logging
import os
from typing import Dict, List, Optional, Tuple, Union
@@ -8,7 +6,8 @@
import pandas as pd
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
logger = logging.getLogger(__name__)
@@ -700,3 +699,16 @@ def feature_columns(self) -> List[str]:
feature_columns.append(f"N_Flank_PWM_Score_{allele}")
feature_columns.append(f"C_Flank_PWM_Score_{allele}")
return feature_columns
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ return cls(
+ peptides=list(set(psms.peptides)),
+ alleles=config.get("allele", []),
+ mhc_class=params.get("class", "I"),
+ remove_modification=True,
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ )
+
+
+feature_generator_factory.register_generator("PWM", PWMFeatureGenerator)
diff --git a/optimhc/feature_generator/spectral_similarity.py b/optimhc/feature/spectral_similarity.py
similarity index 75%
rename from optimhc/feature_generator/spectral_similarity.py
rename to optimhc/feature/spectral_similarity.py
index 698452f..9f3a273 100644
--- a/optimhc/feature_generator/spectral_similarity.py
+++ b/optimhc/feature/spectral_similarity.py
@@ -1,5 +1,3 @@
-# feature_generator/spectral_similarity.py
-
import logging
from typing import Dict, List, Optional, Tuple
@@ -8,8 +6,9 @@
from koinapy import Koina
from optimhc import utils
-from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
-from optimhc.feature_generator.numba_utils import align_peaks, compute_similarity_features
+from optimhc.feature.base_feature_generator import BaseFeatureGenerator
+from optimhc.feature.factory import feature_generator_factory
+from optimhc.feature.numba_utils import align_peaks, compute_similarity_features
from optimhc.parser import extract_mzml_data
logger = logging.getLogger(__name__)
@@ -25,18 +24,30 @@ class SpectralSimilarityFeatureGenerator(BaseFeatureGenerator):
3. Align experimental and predicted spectra
4. Calculate similarity metrics as features
- Parameters:
- peptides (List[str]): List of peptide sequences
- charges (List[int]): List of charge states
- scan_ids (List[int]): List of scan IDs
- mz_file_paths (List[str]): List of mzML file paths
- model_type (str): Prediction model type, either "HCD" or "CID"
- collision_energies (List[float]): List of collision energies, required when model_type is "HCD"
- remove_pre_nxt_aa (bool): Whether to remove preceding and next amino acids, default is True
- remove_modification (bool): Whether to remove modifications, default is True
- url (str): Koina server URL, default is "koina.wilhelmlab.org:443"
- top_n (int): Number of top peaks to use for alignment, default is 12
- tolerance_ppm (float): Mass tolerance for alignment in ppm, default is 20
+ Parameters
+ ----------
+ peptides : list of str
+ List of peptide sequences.
+ charges : list of int
+ List of charge states.
+ scan_ids : list of int
+ List of scan IDs.
+ mz_file_paths : list of str
+ List of mzML file paths.
+ model_type : str
+ Prediction model type, either "HCD" or "CID".
+ collision_energies : list of float
+ List of collision energies, required when model_type is "HCD".
+ remove_pre_nxt_aa : bool
+ Whether to remove preceding and next amino acids, default is True.
+ remove_modification : bool
+ Whether to remove modifications, default is True.
+ url : str
+ Koina server URL, default is "koina.wilhelmlab.org:443".
+ top_n : int
+ Number of top peaks to use for alignment, default is 12.
+ tolerance_ppm : float
+ Mass tolerance for alignment in ppm, default is 20.
"""
def __init__(
@@ -89,7 +100,7 @@ def __init__(
)
self.df["processed_peptide"] = self.df["peptide"].apply(self._preprocess_peptide)
- logger.info(f"Recevied {len(self.df)} PSMs for spectral similarity feature generation")
+ logger.info(f"Received {len(self.df)} PSMs for spectral similarity feature generation")
@property
def id_column(self) -> List[str]:
@@ -118,8 +129,10 @@ def input_df(self) -> pd.DataFrame:
"""
Return the generated features as a DataFrame.
- Returns:
- pd.DataFrame: DataFrame containing the generated features
+ Returns
+ -------
+ pd.DataFrame
+ DataFrame containing the generated features.
"""
return self.df
@@ -304,38 +317,40 @@ def _predict_theoretical_spectra(
else:
raise ValueError(f"Unsupported model type: {self.model_type}")
- # Save the raw prediction results
- self._raw_predictions = predictions.copy()
-
- # Convert prediction results to a suitable format
- pred_df = predictions.copy()
- pred_df.rename(
- columns={
- "peptide_sequences": "processed_peptide",
- "precursor_charges": "charge",
- "intensities": "pred_intensity",
- "mz": "pred_mz",
- },
- inplace=True,
- )
+ # Save the raw prediction results
+ self._raw_predictions = predictions.copy()
+
+ # Convert prediction results to a suitable format
+ pred_df = predictions.copy()
+ pred_df.rename(
+ columns={
+ "peptide_sequences": "processed_peptide",
+ "precursor_charges": "charge",
+ "intensities": "pred_intensity",
+ "mz": "pred_mz",
+ },
+ inplace=True,
+ )
- # Group by peptide and charge, convert predicted mz and intensity to lists
- grouped_df = (
- pred_df.groupby(["processed_peptide", "charge"])
- .agg({"pred_intensity": list, "pred_mz": list, "annotation": list})
- .reset_index()
- )
+ # Group by peptide and charge, convert predicted mz and intensity to lists
+ grouped_df = (
+ pred_df.groupby(["processed_peptide", "charge"])
+ .agg({"pred_intensity": list, "pred_mz": list, "annotation": list})
+ .reset_index()
+ )
- logger.info(f"Successfully predicted {len(grouped_df)} theoretical spectra")
- return grouped_df
+ logger.info(f"Successfully predicted {len(grouped_df)} theoretical spectra")
+ return grouped_df
@property
def raw_predictions(self) -> pd.DataFrame:
"""
Returns the raw prediction results from Koina.
- Returns:
- pd.DataFrame: Raw prediction results DataFrame
+ Returns
+ -------
+ pd.DataFrame
+ Raw prediction results DataFrame.
"""
if self._raw_predictions is None:
if self.results is None:
@@ -346,8 +361,10 @@ def get_raw_predictions(self) -> pd.DataFrame:
"""
Get the raw prediction results DataFrame from Koina.
- Returns:
- pd.DataFrame: Raw prediction results DataFrame
+ Returns
+ -------
+ pd.DataFrame
+ Raw prediction results DataFrame.
"""
return self.raw_predictions
@@ -355,9 +372,12 @@ def save_raw_predictions(self, file_path: str, **kwargs) -> None:
"""
Save the raw prediction results to a file.
- Parameters:
- file_path (str): Path to save the file
- **kwargs: Other parameters passed to pandas.DataFrame.to_csv
+ Parameters
+ ----------
+ file_path : str
+ Path to save the file.
+ **kwargs
+ Other parameters passed to ``pandas.DataFrame.to_csv``.
"""
if "index" not in kwargs:
kwargs["index"] = False
@@ -415,20 +435,27 @@ def _align_spectra_all_peaks(
"""
Align experimental and predicted spectra using ppm tolerance.
- Parameters:
- exp_mz (List[float]): Experimental m/z values
- exp_intensity (List[float]): Experimental intensity values
- pred_mz (List[float]): Predicted m/z values
- pred_intensity (List[float]): Predicted intensity values
- pred_annotation (Optional[List[str]]): Predicted fragment annotations
-
- Returns:
- Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]:
- - Aligned experimental intensity vector
- - Predicted intensity vector
- - Matching index pairs as int array of shape (N, 2),
- where column 0 is pred_idx and column 1 is exp_idx (-1 = no match)
- - Additional info including original sorted arrays
+ Parameters
+ ----------
+ exp_mz : list of float
+ Experimental m/z values.
+ exp_intensity : list of float
+ Experimental intensity values.
+ pred_mz : list of float
+ Predicted m/z values.
+ pred_intensity : list of float
+ Predicted intensity values.
+ pred_annotation : list of str, optional
+ Predicted fragment annotations.
+
+ Returns
+ -------
+ tuple of (np.ndarray, np.ndarray, np.ndarray, dict)
+ - Aligned experimental intensity vector
+ - Predicted intensity vector
+ - Matching index pairs as int array of shape (N, 2),
+ where column 0 is pred_idx and column 1 is exp_idx (-1 = no match)
+ - Additional info including original sorted arrays
"""
# Sort both experimental and predicted spectra by m/z
exp_mz_sorted, exp_intensity_sorted, _ = self._sort_spectrum_by_mz(exp_mz, exp_intensity)
@@ -479,19 +506,25 @@ def _get_top_peaks_vectors(
top_n: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
- Extract top N peaks based on predicted intensity for similarity calculation
+ Extract top N peaks based on predicted intensity for similarity calculation.
- Parameters:
- aligned_exp_intensity (np.ndarray): Aligned experimental intensity vector
- aligned_pred_intensity (np.ndarray): Aligned predicted intensity vector
- matched_indices (np.ndarray): Matching index pairs, shape (N, 2)
- top_n (int): Number of top peaks to extract
+ Parameters
+ ----------
+ aligned_exp_intensity : np.ndarray
+ Aligned experimental intensity vector.
+ aligned_pred_intensity : np.ndarray
+ Aligned predicted intensity vector.
+ matched_indices : np.ndarray
+ Matching index pairs, shape (N, 2).
+ top_n : int
+ Number of top peaks to extract.
- Returns:
- Tuple[np.ndarray, np.ndarray, np.ndarray]:
- - Top N experimental intensity vector
- - Top N predicted intensity vector
- - Top N matching index pairs, shape (top_n, 2)
+ Returns
+ -------
+ tuple of (np.ndarray, np.ndarray, np.ndarray)
+ - Top N experimental intensity vector
+ - Top N predicted intensity vector
+ - Top N matching index pairs, shape (top_n, 2)
"""
num_peaks = min(top_n, len(aligned_pred_intensity))
top_pred_indices = np.argsort(-aligned_pred_intensity)[:num_peaks]
@@ -586,10 +619,12 @@ def _calculate_similarity_features(
def _generate_features(self) -> pd.DataFrame:
"""
- Generate spectral similarity features
+ Generate spectral similarity features.
- Returns:
- pd.DataFrame: DataFrame containing generated features
+ Returns
+ -------
+ pd.DataFrame
+ DataFrame containing generated features.
"""
psm_df = self.df.copy()
pred_spectra_df = self._predict_theoretical_spectra(
@@ -790,3 +825,99 @@ def get_full_data(self) -> pd.DataFrame:
results and raw data used in feature generation.
"""
return self.results
+
+ @staticmethod
+ def _resolve_mzml_paths(psms, params):
+ """Resolve per-PSM mzML file paths from config and PsmContainer."""
+ import os
+ import re
+
+ mzml_dir = params.get("mzmlDir", None)
+ if mzml_dir is None:
+ raise ValueError("mzmlDir is required for SpectralSimilarity feature generator.")
+
+ pattern = params.get("spectrumIdPattern", None)
+ mz_file_names = []
+ spectrum_ids = psms.spectrum_ids
+
+ if pattern:
+ logger.info(f"Using pattern: {pattern} to extract mzML file names from spectrum IDs.")
+ for spectrum_id in spectrum_ids:
+ mz_file_names.append(re.match(pattern, spectrum_id).group(1))
+ logger.info(f"mzML file names: {list(set(mz_file_names))}")
+ else:
+ logger.info("Spectrum ID pattern is not provided.")
+ if psms.ms_data_file_column is not None:
+ logger.info(f"Trying to extract mzML file names from {psms.ms_data_file_column}")
+ logger.info(f"MS data file format: {set(psms.psms[psms.ms_data_file_column])}")
+ for ms_data_file in psms.psms[psms.ms_data_file_column]:
+ mz_file_basename = os.path.basename(ms_data_file).split(".")[0]
+ if mz_file_basename.endswith(".mzML"):
+ mz_file_basename = mz_file_basename[:-5]
+ elif mz_file_basename.endswith("mzML"):
+ mz_file_basename = mz_file_basename[:-4]
+ mz_file_names.append(mz_file_basename)
+ logger.info(f"mzML file names: {list(set(mz_file_names))}")
+ else:
+ logger.info("MS data file information is not provided.")
+ logger.info(
+ r"Trying to use the default pattern: (.+?)\.\d+\.\d+\.\d+ "
+ "to extract mzML file names from spectrum IDs."
+ )
+ for spectrum_id in spectrum_ids:
+ mz_file_names.append(re.match(r"(.+?)\.\d+\.\d+\.\d+", spectrum_id).group(1))
+
+ mz_file_paths = [os.path.join(mzml_dir, f"{mz_file}.mzML") for mz_file in mz_file_names]
+ for mz_file_path in set(mz_file_paths):
+ if not os.path.exists(mz_file_path):
+ logger.error(f"mzML file not found: {mz_file_path}")
+
+ return mz_file_paths
+
+ @classmethod
+ def from_config(cls, psms, config, params):
+ mz_file_paths = cls._resolve_mzml_paths(psms, params)
+ mod_dict = config.get("modificationMap", None)
+ if mod_dict == {}:
+ mod_dict = None
+
+ model_type = params.get("model", None)
+ if model_type is None:
+ raise ValueError("Model type is required for SpectralSimilarity feature generator.")
+
+ n = len(psms.peptides)
+ collision_energy = params.get("collisionEnergy", None)
+ instrument = params.get("instrument", None)
+ fragmentation_type = params.get("fragmentationType", None)
+
+ return cls(
+ spectrum_ids=psms.spectrum_ids,
+ peptides=psms.peptides,
+ charges=psms.charges,
+ scan_ids=psms.scan_ids,
+ mz_file_paths=mz_file_paths,
+ model_type=model_type,
+ collision_energies=[collision_energy] * n if collision_energy else None,
+ instruments=[instrument] * n if instrument else None,
+ fragmentation_types=[fragmentation_type] * n if fragmentation_type else None,
+ remove_pre_nxt_aa=config["removePreNxtAA"],
+ mod_dict=mod_dict,
+ url=params.get("url"),
+ ssl=params.get("ssl", True),
+ top_n=params.get("numTopPeaks", 36),
+ tolerance_ppm=params.get("tolerance", 20),
+ )
+
+ def apply(self, psms, source):
+ features = self.generate_features()
+ psms.add_features(
+ features,
+ psms_key=[psms.spectrum_column, psms.peptide_column, psms.charge_column],
+ feature_key=self.id_column,
+ source=source,
+ )
+
+
+feature_generator_factory.register_generator(
+ "SpectralSimilarity", SpectralSimilarityFeatureGenerator
+)
diff --git a/optimhc/feature_generator/__init__.py b/optimhc/feature_generator/__init__.py
deleted file mode 100644
index 7d30ecc..0000000
--- a/optimhc/feature_generator/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import warnings
-
-warnings.filterwarnings("ignore")
diff --git a/optimhc/feature_generator/base_feature_generator.py b/optimhc/feature_generator/base_feature_generator.py
deleted file mode 100644
index 7cac35c..0000000
--- a/optimhc/feature_generator/base_feature_generator.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# feature_generators/base_feature_generator.py
-
-from abc import ABC, abstractmethod
-from typing import List
-
-import pandas as pd
-
-
-class BaseFeatureGenerator(ABC):
- """
- Abstract base class for all feature generators in the rescoring pipeline.
- """
-
- @property
- @abstractmethod
- def feature_columns(self) -> List[str]:
- """
- Returns a list of feature names generated by the feature generator.
- """
- pass
-
- @property
- @abstractmethod
- def id_column(self) -> List[str]:
- """
- Returns the column or columns used as key or keys to merge features with PSMs.
- """
- pass
-
- @abstractmethod
- def generate_features(self) -> pd.DataFrame:
- """
- Generates features.
- """
- pass
diff --git a/optimhc/gui/__init__.py b/optimhc/gui/__init__.py
deleted file mode 100644
index a750dbc..0000000
--- a/optimhc/gui/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""
-optiMHC GUI module for Streamlit-based interface.
-"""
-
-__all__ = ["app"]
diff --git a/optimhc/gui/app.py b/optimhc/gui/app.py
deleted file mode 100644
index 91db588..0000000
--- a/optimhc/gui/app.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""
-Main application for the optiMHC GUI.
-"""
-
-import streamlit as st
-import os
-import sys
-from pathlib import Path
-
-# Add optiMHC root to path if needed
-optimhc_root = str(Path(__file__).parent.parent.parent)
-if optimhc_root not in sys.path:
- sys.path.append(optimhc_root)
-
-# Import style utilities
-from optimhc.gui.style import set_page_config, apply_custom_css, footer
-
-# Import page modules
-from optimhc.gui.pages import home, configure, run, results
-
-
-def main():
- """
- Main application entry point.
- """
- # Set up page config
- set_page_config()
-
- # Apply custom CSS
- apply_custom_css()
-
- # Initialize session state for navigation
- if "page" not in st.session_state:
- st.session_state.page = "home"
-
- # Sidebar navigation
- st.sidebar.title("Navigation")
-
- # Navigation buttons
- if st.sidebar.button("Home", use_container_width=True):
- st.session_state.page = "home"
- st.rerun()
-
- if st.sidebar.button("Configure", use_container_width=True):
- st.session_state.page = "configure"
- st.rerun()
-
- if st.sidebar.button("Run Pipeline", use_container_width=True):
- st.session_state.page = "run"
- st.rerun()
-
- if st.sidebar.button("Results", use_container_width=True):
- st.session_state.page = "results"
- st.rerun()
-
- # Version info in sidebar
- st.sidebar.markdown("---")
- from optimhc import __version__
- st.sidebar.info(f"optiMHC v{__version__}")
-
- # Render the selected page
- if st.session_state.page == "home":
- home.render()
- elif st.session_state.page == "configure":
- configure.render()
- elif st.session_state.page == "run":
- run.render()
- elif st.session_state.page == "results":
- results.render()
-
- # Footer
- footer()
-
-
-if __name__ == "__main__":
- main()
diff --git a/optimhc/gui/components/__init__.py b/optimhc/gui/components/__init__.py
deleted file mode 100644
index 1338c94..0000000
--- a/optimhc/gui/components/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""
-GUI components for the optiMHC Streamlit interface.
-"""
-
-__all__ = [
- "config_form",
- "file_upload",
- "log_viewer",
- "pipeline_control",
- "results_viewer",
- "feature_generator_form",
- "rescore_form",
- "modification_map"
-]
diff --git a/optimhc/gui/components/config_form.py b/optimhc/gui/components/config_form.py
deleted file mode 100644
index 1344885..0000000
--- a/optimhc/gui/components/config_form.py
+++ /dev/null
@@ -1,399 +0,0 @@
-"""
-Configuration form component for optiMHC GUI.
-"""
-
-import os
-import streamlit as st
-import yaml
-from typing import Dict, Any, List, Optional
-import json
-
-# Import optiMHC config defaults
-from optimhc.core.config import DEFAULT_CONFIG
-
-
-def feature_generator_form(existing_generators: List[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
- """
- Create a form for configuring feature generators.
-
- Args:
- existing_generators: List of existing feature generator configurations
-
- Returns:
- List of feature generator configurations
- """
- if existing_generators is None:
- existing_generators = []
-
- feature_generators = []
-
- # Known feature generators and their parameters
- generator_options = {
- "Basic": {},
- "PWM": {"class": ["I", "II"]},
- "MHCflurry": {},
- "NetMHCpan": {},
- "NetMHCIIpan": {},
- "DeepLC": {
- "calibrationCriteria": ["expect", "xcorr", "hyperscore"],
- "lowerIsBetter": [True, False],
- "calibrationSize": [0.1, 0.2, 0.3]
- },
- "SpectralSimilarity": {
- "model": ["AlphaPeptDeep_ms2_generic", "AlphaPeptDeep_ms2_HCD", "AlphaPeptDeep_ms2_CID"],
- "instrument": ["LUMOS", "QE", "VELOS", "FUSION"],
- "numTopPeaks": [10, 20, 36, 50]
- },
- "OverlappingPeptide": {
- "minOverlapLength": [7, 8, 9],
- "minLength": [7, 8, 9],
- "maxLength": [15, 20, 25],
- "overlappingScore": ["expect", "xcorr", "hyperscore"]
- }
- }
-
- st.subheader("Feature Generators")
-
- # Use session state to keep track of the number of generators
- if "num_generators" not in st.session_state:
- st.session_state.num_generators = max(1, len(existing_generators))
-
- # Add/remove generator controls outside of the form
- col1, col2 = st.columns([1, 5])
- with col1:
- if st.button("➕ Add Generator", key="add_generator"):
- st.session_state.num_generators += 1
- st.rerun()
- with col2:
- if st.session_state.num_generators > 1 and st.button("➖ Remove Last Generator", key="remove_generator"):
- st.session_state.num_generators -= 1
- st.rerun()
-
- # Generate forms for each feature generator
- for i in range(st.session_state.num_generators):
- with st.expander(f"Feature Generator {i+1}", expanded=True):
- existing_gen = {} if i >= len(existing_generators) else existing_generators[i]
-
- # Feature generator name
- generator_name = st.selectbox(
- "Generator Type",
- options=list(generator_options.keys()),
- key=f"gen_type_{i}",
- index=list(generator_options.keys()).index(existing_gen.get("name", "Basic")) if existing_gen.get("name") in generator_options else 0
- )
-
- # Feature generator parameters
- params = {}
- if generator_options[generator_name]:
- st.markdown("**Parameters:**")
- for param_name, param_options in generator_options[generator_name].items():
- existing_params = existing_gen.get("params", {})
- existing_value = existing_params.get(param_name)
-
- # Handle different parameter types
- if isinstance(param_options, list):
- if all(isinstance(x, bool) for x in param_options):
- param_value = st.checkbox(
- param_name,
- value=existing_value if existing_value is not None else param_options[0],
- key=f"gen_{i}_{param_name}"
- )
- elif all(isinstance(x, (int, float)) for x in param_options):
- param_value = st.number_input(
- param_name,
- value=existing_value if existing_value is not None else param_options[0],
- key=f"gen_{i}_{param_name}"
- )
- else:
- param_value = st.selectbox(
- param_name,
- options=param_options,
- index=param_options.index(existing_value) if existing_value in param_options else 0,
- key=f"gen_{i}_{param_name}"
- )
- else:
- param_value = st.text_input(
- param_name,
- value=str(existing_value) if existing_value is not None else "",
- key=f"gen_{i}_{param_name}"
- )
-
- params[param_name] = param_value
-
- # Add to list of generators
- generator_config = {"name": generator_name}
- if params:
- generator_config["params"] = params
-
- feature_generators.append(generator_config)
-
- return feature_generators
-
-
-def rescore_form(existing_rescore: Dict[str, Any] = None) -> Dict[str, Any]:
- """
- Create a form for rescoring settings.
-
- Args:
- existing_rescore: Existing rescore configuration
-
- Returns:
- Rescore configuration dictionary
- """
- if existing_rescore is None:
- existing_rescore = DEFAULT_CONFIG["rescore"]
-
- st.subheader("Rescoring Settings")
-
- rescore_model = st.selectbox(
- "Rescoring Model",
- options=["Percolator", "XGBoost", "RandomForest"],
- index=["Percolator", "XGBoost", "RandomForest"].index(existing_rescore.get("model", "Percolator")),
- help="Model to use for rescoring"
- )
-
- test_fdr = st.number_input(
- "Test FDR",
- min_value=0.001,
- max_value=0.1,
- value=float(existing_rescore.get("testFDR", 0.01)),
- step=0.001,
- format="%.3f",
- help="FDR threshold for testing"
- )
-
- num_jobs = st.number_input(
- "Number of Jobs",
- min_value=1,
- max_value=32,
- value=int(existing_rescore.get("numJobs", 1)),
- help="Number of parallel jobs for model training"
- )
-
- return {
- "model": rescore_model,
- "testFDR": test_fdr,
- "numJobs": num_jobs
- }
-
-
-def config_form(existing_config: Dict[str, Any] = None) -> Dict[str, Any]:
- """
- Create a form for configuring the pipeline.
-
- Args:
- existing_config: Existing configuration dictionary
-
- Returns:
- Configuration dictionary
- """
- if existing_config is None:
- existing_config = DEFAULT_CONFIG
-
- st.subheader("Basic Settings")
-
- experiment_name = st.text_input(
- "Experiment Name",
- value=existing_config.get("experimentName", ""),
- help="Name of the experiment"
- )
-
- input_type = st.selectbox(
- "Input Type",
- options=["pepxml", "pin"],
- index=["pepxml", "pin"].index(existing_config.get("inputType", "pepxml")),
- help="Type of input file"
- )
-
- # For GUI, we'll handle input files differently than the direct file paths
- input_files = existing_config.get("inputFile", [])
- if isinstance(input_files, str):
- input_files = [input_files]
-
- input_files_str = st.text_area(
- "Input Files",
- value="\n".join(input_files) if input_files else "",
- height=100,
- help="One file path per line. Use file uploader to add files."
- )
-
- input_files = [f for f in input_files_str.strip().split("\n") if f]
-
- decoy_prefix = st.text_input(
- "Decoy Prefix",
- value=existing_config.get("decoyPrefix", "DECOY_"),
- help="Prefix used to identify decoy sequences"
- )
-
- output_dir = st.text_input(
- "Output Directory",
- value=existing_config.get("outputDir", "./results"),
- help="Directory where results will be saved"
- )
-
- # Allele settings
- st.subheader("Allele Settings")
-
- alleles = existing_config.get("allele", [])
- if isinstance(alleles, str):
- alleles = [alleles]
-
- alleles_str = st.text_area(
- "Alleles",
- value="\n".join(alleles) if alleles else "",
- height=100,
- help="One allele per line, e.g., HLA-A*02:01"
- )
-
- alleles = [a for a in alleles_str.strip().split("\n") if a]
-
- # Performance settings
- st.subheader("Performance Settings")
-
- col1, col2 = st.columns(2)
-
- with col1:
- num_processes = st.number_input(
- "Number of Processes",
- min_value=1,
- max_value=64,
- value=int(existing_config.get("numProcesses", 4)),
- help="Number of parallel processes"
- )
-
- with col2:
- show_progress = st.checkbox(
- "Show Progress",
- value=existing_config.get("showProgress", True),
- help="Show progress bars during processing"
- )
-
- col1, col2 = st.columns(2)
-
- with col1:
- visualization = st.checkbox(
- "Enable Visualization",
- value=existing_config.get("visualization", True),
- help="Generate visualizations of results"
- )
-
- with col2:
- remove_pre_nxt_aa = st.checkbox(
- "Remove Pre/Next Amino Acids",
- value=existing_config.get("removePreNxtAA", False),
- help="Remove pre/post neighboring amino acids in sequence processing"
- )
-
- log_level = st.selectbox(
- "Log Level",
- options=["DEBUG", "INFO", "WARNING", "ERROR"],
- index=["DEBUG", "INFO", "WARNING", "ERROR"].index(existing_config.get("logLevel", "INFO")),
- help="Logging verbosity level"
- )
-
- # Advanced sections
-
- # Feature generators
- feature_generators = feature_generator_form(existing_config.get("featureGenerator", []))
-
- # Rescoring
- rescore = rescore_form(existing_config.get("rescore", {}))
-
- # Combine all settings
- config = {
- "experimentName": experiment_name,
- "inputType": input_type,
- "inputFile": input_files,
- "decoyPrefix": decoy_prefix,
- "outputDir": output_dir,
- "allele": alleles,
- "numProcesses": num_processes,
- "showProgress": show_progress,
- "visualization": visualization,
- "removePreNxtAA": remove_pre_nxt_aa,
- "logLevel": log_level,
- "featureGenerator": feature_generators,
- "rescore": rescore
- }
-
- return config
-
-
-def render_config_summary(config: Dict[str, Any]):
- """
- Render a summary of the configuration as a YAML code block.
-
- Args:
- config: Configuration dictionary
- """
- st.subheader("Configuration Summary")
-
- # Create a simplified copy of the configuration to display
- display_config = config.copy()
-
- # Convert to YAML string
- config_yaml = yaml.dump(display_config, default_flow_style=False, sort_keys=False)
-
- # Display as a code block with syntax highlighting
- st.code(config_yaml, language="yaml")
-
-
-def validate_config(config: Dict[str, Any]) -> List[str]:
- """
- Validate configuration for obvious errors.
-
- Args:
- config: Configuration dictionary
-
- Returns:
- List of error messages, empty if configuration is valid
- """
- errors = []
-
- # Check required fields
- required_fields = ["experimentName", "inputType", "inputFile", "outputDir", "allele"]
- for field in required_fields:
- if field not in config or not config[field]:
- errors.append(f"Missing required field: {field}")
-
- # Check inputType
- if config.get("inputType") not in ["pepxml", "pin"]:
- errors.append("Input type must be 'pepxml' or 'pin'")
-
- # Check feature generators
- generators = config.get("featureGenerator", [])
- if not generators:
- errors.append("At least one feature generator is required")
-
- # Check if SpectralSimilarity has required parameters
- for gen in generators:
- if gen.get("name") == "SpectralSimilarity":
- params = gen.get("params", {})
- # Check instrument
- instrument = params.get("instrument")
- if instrument and instrument not in ["QE", "LUMOS", "TIMSTOF", "SCIEXTOF"]:
- errors.append(f"Invalid instrument '{instrument}' for SpectralSimilarity. Must be one of: QE, LUMOS, TIMSTOF, SCIEXTOF")
-
- # Check mzML directory
- if "mzmlDir" not in params:
- errors.append("SpectralSimilarity requires 'mzmlDir' parameter")
-
- # Check spectrum ID pattern
- if "spectrumIdPattern" not in params:
- errors.append("SpectralSimilarity requires 'spectrumIdPattern' parameter to extract mzML filenames from spectrum IDs")
-
- # Check rescore settings
- rescore = config.get("rescore", {})
- if not rescore or "model" not in rescore:
- errors.append("Rescore model is required")
-
- if "testFDR" in rescore and (rescore["testFDR"] <= 0 or rescore["testFDR"] > 1):
- errors.append("Test FDR must be between 0 and 1")
-
- # Check modification map format
- mod_map = config.get("modificationMap", {})
- for mass, unimod in mod_map.items():
- if not unimod.startswith("UNIMOD:"):
- errors.append(f"Invalid UNIMOD format for mass {mass}: {unimod}. Must start with 'UNIMOD:'")
-
- return errors
diff --git a/optimhc/gui/components/feature_generator_form.py b/optimhc/gui/components/feature_generator_form.py
deleted file mode 100644
index 44e7e1d..0000000
--- a/optimhc/gui/components/feature_generator_form.py
+++ /dev/null
@@ -1,267 +0,0 @@
-"""
-Feature generator form component for optiMHC GUI.
-"""
-
-import streamlit as st
-from typing import Dict, Any, List
-
-def feature_generator_form(existing_generators: List[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
- """
- Create a form for configuring feature generators.
-
- Args:
- existing_generators: List of existing feature generator configurations
-
- Returns:
- List of feature generator configurations
- """
- if existing_generators is None:
- existing_generators = []
-
- # Convert existing generators to a dict for easier lookup
- existing_gen_dict = {}
- for gen in existing_generators:
- existing_gen_dict[gen["name"]] = gen.get("params", {})
-
- feature_generators = []
-
- st.subheader("Feature Generators")
-
- # Determine the MHC class from existing configuration
- # Look for PWM first as it has explicit class parameter
- mhc_class = None
- for gen in existing_generators:
- if gen["name"] == "PWM" and "params" in gen and "class" in gen["params"]:
- mhc_class = gen["params"]["class"]
- break
-
- # If PWM not found, infer from presence of NetMHCIIpan
- if mhc_class is None:
- if any(gen["name"] == "NetMHCIIpan" for gen in existing_generators):
- mhc_class = "II"
- else:
- mhc_class = "I" # Default to class I
-
- # MHC class selection
- mhc_class = st.radio(
- "MHC Class",
- options=["I", "II"],
- index=0 if mhc_class == "I" else 1,
- horizontal=True,
- help="Select MHC class for appropriate feature generators."
- )
-
- st.markdown("---")
- st.markdown("Select which feature generators to use in the pipeline:")
-
- # Basic feature generator (always available)
- if st.checkbox("Basic", value="Basic" in existing_gen_dict or not existing_generators, key="basic_gen"):
- feature_generators.append({"name": "Basic"})
-
- # PWM feature generator (class parameter set automatically based on MHC class selection)
- if st.checkbox("PWM", value="PWM" in existing_gen_dict, key="pwm_gen"):
- feature_generators.append({
- "name": "PWM",
- "params": {"class": mhc_class}
- })
-
- # Class I specific generators
- if mhc_class == "I":
- # MHCflurry (class I only)
- if st.checkbox("MHCflurry", value="MHCflurry" in existing_gen_dict, key="mhcflurry_gen"):
- feature_generators.append({"name": "MHCflurry"})
-
- # NetMHCpan (class I only)
- if st.checkbox("NetMHCpan", value="NetMHCpan" in existing_gen_dict, key="netmhcpan_gen"):
- feature_generators.append({"name": "NetMHCpan"})
-
- # Class II specific generators
- else: # mhc_class == "II"
- # NetMHCIIpan (class II only)
- if st.checkbox("NetMHCIIpan", value="NetMHCIIpan" in existing_gen_dict, key="netmhciipan_gen"):
- feature_generators.append({"name": "NetMHCIIpan"})
-
- # DeepLC feature generator (available for both classes)
- if st.checkbox("DeepLC", value="DeepLC" in existing_gen_dict, key="deeplc_gen"):
- deeplc_params = {}
-
- col1, col2 = st.columns(2)
- with col1:
- calibration_criteria = st.text_input(
- "Calibration Criteria",
- value=existing_gen_dict.get("DeepLC", {}).get("calibrationCriteria", "expect"),
- key="deeplc_calibration_criteria",
- help="Criteria for calibration (e.g., expect, xcorr, hyperscore)"
- )
- deeplc_params["calibrationCriteria"] = calibration_criteria
-
- with col2:
- lower_is_better = st.checkbox(
- "Lower Is Better",
- value=existing_gen_dict.get("DeepLC", {}).get("lowerIsBetter", True),
- key="deeplc_lower_is_better",
- help="Whether lower values of the calibration criteria are better (True for expect, False for xcorr/hyperscore)"
- )
- deeplc_params["lowerIsBetter"] = lower_is_better
-
- calibration_size = st.slider(
- "Calibration Size",
- min_value=0.05,
- max_value=0.5,
- value=float(existing_gen_dict.get("DeepLC", {}).get("calibrationSize", 0.1)),
- step=0.05,
- key="deeplc_calibration_size",
- help="Fraction of PSMs to use for calibration (0.05-0.5)"
- )
- deeplc_params["calibrationSize"] = calibration_size
-
- feature_generators.append({"name": "DeepLC", "params": deeplc_params})
-
- # SpectralSimilarity feature generator (with AlphaPeptDeep as default)
- if st.checkbox("SpectralSimilarity", value="SpectralSimilarity" in existing_gen_dict or not existing_generators, key="spectra_similarity_gen"):
- ss_params = {}
-
- st.markdown("#### SpectralSimilarity Settings")
-
- model = st.selectbox(
- "Model",
- options=["AlphaPeptDeep_ms2_generic"],
- index=["AlphaPeptDeep_ms2_generic"].index(
- existing_gen_dict.get("SpectralSimilarity", {}).get("model", "AlphaPeptDeep_ms2_generic")
- ),
- key="spectra_similarity_model",
- help="Prediction model for theoretical spectra"
- )
- ss_params["model"] = model
-
- instrument = st.selectbox(
- "Instrument",
- options=["QE", "LUMOS", "TIMSTOF", "SCIEXTOF"],
- index=["QE", "LUMOS", "TIMSTOF", "SCIEXTOF"].index(
- existing_gen_dict.get("SpectralSimilarity", {}).get("instrument", "LUMOS")
- ),
- key="spectra_similarity_instrument",
- help="Available instruments: QE, LUMOS, TIMSTOF, SCIEXTOF"
- )
- ss_params["instrument"] = instrument
-
- # mzML directory path
- mzml_dir = st.text_input(
- "mzML Directory Path",
- value=existing_gen_dict.get("SpectralSimilarity", {}).get("mzmlDir", "./data"),
- key="spectra_similarity_mzml_dir",
- help="Path to directory containing mzML files"
- )
- if mzml_dir:
- ss_params["mzmlDir"] = mzml_dir
-
- # Spectrum ID pattern
- spectrum_id_pattern = st.text_input(
- "Spectrum ID Pattern",
- value=existing_gen_dict.get("SpectralSimilarity", {}).get("spectrumIdPattern", "(.+?)\\.\\d+\\.\\d+\\.\\d+"),
- key="spectra_similarity_spectrum_id_pattern",
- help="Regular expression pattern to extract mzML filename from spectrum IDs. Default pattern: (.+?)\\.\\d+\\.\\d+\\.\\d+"
- )
- if spectrum_id_pattern:
- ss_params["spectrumIdPattern"] = spectrum_id_pattern
-
- collision_energy = st.number_input(
- "Collision Energy",
- min_value=20,
- max_value=40,
- value=int(existing_gen_dict.get("SpectralSimilarity", {}).get("collisionEnergy", 28)),
- key="spectra_similarity_collision_energy",
- help="Collision energy used during acquisition (typical range: 25-30)"
- )
- ss_params["collisionEnergy"] = collision_energy
-
- tolerance = st.slider(
- "Tolerance (ppm)",
- min_value=10,
- max_value=50,
- value=int(existing_gen_dict.get("SpectralSimilarity", {}).get("tolerance", 20)),
- step=5,
- key="spectra_similarity_tolerance",
- help="Mass tolerance in ppm for peak matching (10-50 ppm)"
- )
- ss_params["tolerance"] = tolerance
-
- num_top_peaks = st.slider(
- "Number of Top Peaks",
- min_value=10,
- max_value=100,
- value=int(existing_gen_dict.get("SpectralSimilarity", {}).get("numTopPeaks", 36)),
- step=2,
- key="spectra_similarity_num_top_peaks",
- help="Number of most intense peaks to consider for matching"
- )
- ss_params["numTopPeaks"] = num_top_peaks
-
- url = st.text_input(
- "API URL",
- value=existing_gen_dict.get("SpectralSimilarity", {}).get("url", "koina.wilhelmlab.org:443"),
- key="spectra_similarity_url",
- help="AlphaPept API URL (default: koina.wilhelmlab.org:443)"
- )
- if url:
- ss_params["url"] = url
-
- feature_generators.append({"name": "SpectralSimilarity", "params": ss_params})
-
- # OverlappingPeptide feature generator
- if st.checkbox("OverlappingPeptide", value="OverlappingPeptide" in existing_gen_dict, key="overlapping_peptide_gen"):
- op_params = {}
-
- st.markdown("#### OverlappingPeptide Settings")
-
- col1, col2 = st.columns(2)
- with col1:
- min_overlap_length = st.number_input(
- "Min Overlap Length",
- min_value=5,
- max_value=15,
- value=int(existing_gen_dict.get("OverlappingPeptide", {}).get("minOverlapLength", 7)),
- key="op_min_overlap_length",
- help="Minimum number of amino acids that must overlap"
- )
- op_params["minOverlapLength"] = min_overlap_length
-
- with col2:
- overlapping_score = st.text_input(
- "Overlapping Score",
- value=existing_gen_dict.get("OverlappingPeptide", {}).get("overlappingScore", "expect"),
- key="op_overlapping_score",
- help="Score to use for overlapping peptides (e.g., expect, xcorr, hyperscore)"
- )
- op_params["overlappingScore"] = overlapping_score
-
- col1, col2 = st.columns(2)
- with col1:
- min_length = st.number_input(
- "Min Length",
- min_value=5,
- max_value=15,
- value=int(existing_gen_dict.get("OverlappingPeptide", {}).get("minLength", 7 if mhc_class == "I" else 9)),
- key="op_min_length",
- help="Minimum peptide length to consider"
- )
- op_params["minLength"] = min_length
-
- with col2:
- max_length = st.number_input(
- "Max Length",
- min_value=10,
- max_value=50,
- value=int(existing_gen_dict.get("OverlappingPeptide", {}).get("maxLength", 20 if mhc_class == "I" else 30)),
- key="op_max_length",
- help="Maximum peptide length to consider"
- )
- op_params["maxLength"] = max_length
-
- feature_generators.append({"name": "OverlappingPeptide", "params": op_params})
-
- # Warning if no generators selected
- if not feature_generators:
- st.warning("Please select at least one feature generator.")
-
- return feature_generators
diff --git a/optimhc/gui/components/file_upload.py b/optimhc/gui/components/file_upload.py
deleted file mode 100644
index f830666..0000000
--- a/optimhc/gui/components/file_upload.py
+++ /dev/null
@@ -1,159 +0,0 @@
-"""
-File upload component for optiMHC GUI.
-"""
-
-import os
-import streamlit as st
-import yaml
-import tempfile
-from typing import Dict, Any, Optional, Tuple
-from optimhc.gui.utils import load_config_from_yaml
-
-
-def config_file_uploader() -> Optional[Dict[str, Any]]:
- """
- Display a file uploader for configuration files.
-
- Returns:
- Configuration dictionary if a file is uploaded, None otherwise
- """
- uploaded_file = st.file_uploader(
- "Upload configuration file",
- type=["yaml", "yml"],
- help="Upload a YAML configuration file"
- )
-
- if uploaded_file is not None:
- try:
- config = yaml.safe_load(uploaded_file)
- st.success(f"Configuration file '{uploaded_file.name}' loaded successfully")
- return config
- except Exception as e:
- st.error(f"Error loading configuration file: {str(e)}")
- return None
-
- return None
-
-
-def input_path_field(input_type: str, value: str = "", placeholder: str = "") -> str:
- """
- Display an input field for file paths.
-
- Args:
- input_type: Type of input (pepxml, pin, mzML directory)
- value: Current value
- placeholder: Placeholder text
-
- Returns:
- String containing file paths, one per line
- """
- if input_type.lower() in ["pepxml", "pin"]:
- help_text = f"Enter the full path to your {input_type} files, one per line"
- elif input_type.lower() == "mzml":
- help_text = "Enter the full path to your mzML directory"
- else:
- help_text = "Enter file paths, one per line"
-
- paths = st.text_area(
- f"{input_type} File Paths",
- value=value,
- placeholder=placeholder,
- height=100,
- help=help_text,
- key=f"{input_type.lower()}_paths"
- )
-
- return paths
-
-
-def yaml_example(example_type: str = "class_i") -> str:
- """
- Return an example YAML configuration by reading from example files.
-
- Args:
- example_type: Type of example (class_i or class_ii)
-
- Returns:
- Example YAML configuration as a string
- """
- # Get the path to the examples directory
- current_dir = os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- )
- examples_dir = os.path.join(current_dir, "examples")
-
- # Determine which example file to read
- if example_type.lower() == "class_i":
- example_file = os.path.join(examples_dir, "classI_example.yaml")
- default_config = """
-experimentName: class_I_example
-inputType: pepxml
-inputFile:
- - ./data/YE_20180428_SK_HLA_A0202_3Ips_a50mio_R1_01.pep.xml
-decoyPrefix: DECOY_
-outputDir: ./examples/results/class_I_example
-visualization: True
-removePreNxtAA: False
-numProcesses: 4
-showProgress: True
-modificationMap:
- "147.035385": "UNIMOD:35" # Oxidation (M)
- "160.030649": "UNIMOD:4" # Carbamidomethyl (C)
-allele:
- - HLA-A*02:02
-featureGenerator:
- - name: Basic
- - name: PWM
- params:
- class: I
- - name: MHCflurry
- - name: NetMHCpan
-rescore:
- testFDR: 0.01
- model: Percolator
- numJobs: 4
-"""
- else: # class_ii
- example_file = os.path.join(examples_dir, "classII_example.yaml")
- default_config = """
-experimentName: class_II_example
-inputType: pepxml
-inputFile:
- - ./data/AG20201214_FAIMS_DPB0101_DPA0201_93e6_1hr.pep.xml
-decoyPrefix: DECOY_
-outputDir: ./examples/results/class_II_example
-visualization: True
-removePreNxtAA: False
-numProcesses: 4
-showProgress: True
-modificationMap:
- "147.035385": "UNIMOD:35" # Oxidation (M)
- "160.030649": "UNIMOD:4" # Carbamidomethyl (C)
-allele:
- - HLA-DPA1*02:01-DPB1*01:01
-featureGenerator:
- - name: Basic
- - name: PWM
- params:
- class: II
- - name: NetMHCIIpan
-rescore:
- testFDR: 0.01
- model: Percolator
- numJobs: 4
-"""
-
- try:
- if not os.path.exists(example_file):
- st.warning(f"Example file not found: {example_file}, using default configuration")
- return default_config
-
- with open(example_file, 'r') as f:
- content = f.read()
- if not content:
- st.warning(f"Example file is empty: {example_file}, using default configuration")
- return default_config
- return content
- except Exception as e:
- st.warning(f"Error reading example file: {str(e)}, using default configuration")
- return default_config
diff --git a/optimhc/gui/components/log_viewer.py b/optimhc/gui/components/log_viewer.py
deleted file mode 100644
index c1719ce..0000000
--- a/optimhc/gui/components/log_viewer.py
+++ /dev/null
@@ -1,250 +0,0 @@
-"""
-Log viewer component for optiMHC GUI.
-"""
-
-import os
-from typing import List, Optional
-import streamlit as st
-
-
-def display_logs(logs: List[str]):
- """
- Display logs as read-only text with auto-scrolling.
-
- Args:
- logs: List of log messages
- """
- if not logs:
- st.info("No logs to display yet...")
- return
-
- # Join logs with newlines
- log_text = "\n".join(logs)
-
- # Use a container with custom CSS to create a taller scrollable area
- log_container = st.container()
-
- with log_container:
- # Add custom CSS for taller log area with scrollbar
- st.markdown("""
-
- """, unsafe_allow_html=True)
-
- # Use st.code for read-only display
- st.code(log_text, language="plain")
-
- # Add JavaScript to auto-scroll to bottom
- # This ensures the latest logs are always visible
- auto_scroll_js = """
-
- """
- st.components.v1.html(auto_scroll_js, height=0)
-
-
-def find_pipeline_log_file() -> Optional[str]:
- """
- Find the pipeline log file based on configuration.
-
- Returns:
- Path to the log file or None if not found
- """
- # Try to get output directory and experiment name from config
- output_dir = None
- experiment_name = None
-
- if "config" in st.session_state:
- config = st.session_state.config
- output_dir = config.get("outputDir")
- experiment_name = config.get("experimentName")
-
- # First, check if we already know the log file path from pipeline execution
- if "pipeline_log_path" in st.session_state and st.session_state.pipeline_log_path:
- log_path = st.session_state.pipeline_log_path
- if os.path.exists(log_path):
- return log_path
-
- # Next, try to find log file in the expected pipeline output directory
- if output_dir and experiment_name:
- experiment_dir = os.path.join(output_dir, experiment_name)
- if os.path.exists(experiment_dir):
- # Pipeline's default log file
- log_path = os.path.join(experiment_dir, "log")
- if os.path.exists(log_path):
- return log_path
-
- # Look for any log files in the experiment directory
- for root, _, files in os.walk(experiment_dir):
- for file in files:
- if file.endswith(".log") or file == "log":
- return os.path.join(root, file)
-
- # If no log in experiment directory, search the main output directory
- if output_dir and os.path.exists(output_dir):
- log_files = []
- for root, _, files in os.walk(output_dir):
- log_files.extend([os.path.join(root, f) for f in files
- if f.endswith(".log") or f == "log"])
-
- if log_files:
- # Return the most recently modified log file
- return max(log_files, key=os.path.getmtime)
-
- return None
-
-
-def read_log_file(log_path, max_lines=1000):
- """
- Read log content from file.
-
- Args:
- log_path: Path to the log file
- max_lines: Maximum number of lines to read
-
- Returns:
- List of log lines
- """
- try:
- if not os.path.exists(log_path):
- return []
-
- with open(log_path, 'r', encoding='utf-8') as f:
- lines = f.readlines()
- # Return the last max_lines lines
- return [line.rstrip() for line in lines[-max_lines:]]
- except Exception as e:
- print(f"Error reading log file: {str(e)}")
- return []
-
-
-def update_logs():
- """
- Update logs from the log file.
-
- Returns:
- True if logs were updated, False otherwise
- """
- # Find the log file
- log_path = find_pipeline_log_file()
- if not log_path:
- return False
-
- # Read the log file
- logs = read_log_file(log_path)
- if not logs:
- return False
-
- # Update session state
- st.session_state.logs = logs
- return True
-
-
-def log_viewer(process=None):
- """
- Simple log viewer with manual refresh button.
-
- Args:
- process: Optional subprocess to monitor for status
- """
- st.subheader("Log Output")
-
- # Status indicator if process is provided
- if process:
- if process.poll() is None:
- st.caption("📋 Process is running...")
- else:
- ret_code = process.poll()
- if ret_code == 0:
- st.caption("✅ Process completed successfully. Return code: 0")
- else:
- st.caption(f"⚠️ Process completed with errors. Return code: {ret_code}")
-
- # Find log file
- log_path = find_pipeline_log_file()
-
- # Controls row
- col1, col2, col3 = st.columns([2, 1, 1])
-
- with col1:
- if log_path:
- st.caption(f"Log file: {log_path}")
- else:
- st.caption("No log file found")
-
- with col2:
- # Clear logs button
- if st.button("Clear Logs"):
- st.session_state.logs = []
- st.rerun()
-
- with col3:
- # Refresh button
- if st.button("Refresh Logs"):
- update_logs()
- st.rerun()
-
- # Debug info (collapsed)
- with st.expander("Debug Info", expanded=False):
- log_path = find_pipeline_log_file() or "Not found"
- log_exists = "Yes" if log_path != "Not found" and os.path.exists(log_path) else "No"
- log_size = "0" if log_path == "Not found" or not os.path.exists(log_path) else str(os.path.getsize(log_path))
-
- process_info = ""
- if process:
- process_info = f"""
-Process PID: {process.pid}
-Process Return Code: {process.poll()}
-Has stdout: {"Yes" if hasattr(process, 'stdout') and process.stdout else "No"}
-"""
-
- st.code(f"""{process_info}
-Log Count: {len(st.session_state.logs) if "logs" in st.session_state else 0}
-Log File: {log_path}
-Log File Exists: {log_exists}
-Log File Size: {log_size} bytes
- """)
-
- # Initialize logs if needed
- if "logs" not in st.session_state:
- st.session_state.logs = []
- # Try to load logs the first time
- update_logs()
-
- # Display the logs
- display_logs(st.session_state.logs)
diff --git a/optimhc/gui/components/modification_map.py b/optimhc/gui/components/modification_map.py
deleted file mode 100644
index 6a61af0..0000000
--- a/optimhc/gui/components/modification_map.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Modification map component for optiMHC GUI.
-"""
-
-import streamlit as st
-from typing import Dict, Any, Optional
-
-def modification_map_form(existing_map: Optional[Dict[str, str]] = None) -> Dict[str, str]:
- """
- Create a form for modification map configuration.
-
- Args:
- existing_map: Existing modification map configuration
-
- Returns:
- Modification map dictionary mapping masses to UNIMOD values
- """
- if existing_map is None:
- existing_map = {
- "147.035385": "UNIMOD:35", # Oxidation (M) - Full modified residue mass
- "160.030649": "UNIMOD:4", # Carbamidomethyl (C) - Full modified residue mass
- "166.998359": "UNIMOD:21" # Phospho (S) - Full modified residue mass
- }
-
- st.subheader("Modification Map")
-
- st.markdown("""
- Specify the mapping from modification masses to UNIMOD identifiers.
- The mass value should be the FULL modified residue mass (amino acid + modification) as found in pepXML parameters.
- All modifications need to be explicitly encoded in the sequence (e.g., C[UNIMOD:4] for carbamidomethylated cysteine).
- """)
-
- # Create a container for the dynamic map
- modification_map = {}
-
- # Use session state to track number of modification entries
- if "num_modifications" not in st.session_state:
- st.session_state.num_modifications = len(existing_map)
- st.session_state.modification_masses = list(existing_map.keys())
- st.session_state.modification_values = list(existing_map.values())
-
- # Add/remove modification controls
- col1, col2 = st.columns([1, 5])
- with col1:
- if st.button("➕ Add Modification", key="add_modification"):
- st.session_state.num_modifications += 1
- st.session_state.modification_masses.append("")
- st.session_state.modification_values.append("UNIMOD:")
- st.rerun()
- with col2:
- if st.session_state.num_modifications > 0 and st.button("➖ Remove Last Modification", key="remove_modification"):
- st.session_state.num_modifications -= 1
- if st.session_state.modification_masses:
- st.session_state.modification_masses.pop()
- if st.session_state.modification_values:
- st.session_state.modification_values.pop()
- st.rerun()
-
- # Create a table-like interface for modifications
- if st.session_state.num_modifications > 0:
- col1, col2 = st.columns(2)
- with col1:
- st.markdown("**Mass (Residue+Modification)**")
- with col2:
- st.markdown("**UNIMOD Identifier**")
-
- for i in range(st.session_state.num_modifications):
- col1, col2 = st.columns(2)
- with col1:
- mass = st.text_input(
- "Mass",
- value=st.session_state.modification_masses[i] if i < len(st.session_state.modification_masses) else "",
- key=f"mod_mass_{i}",
- label_visibility="collapsed"
- )
- st.session_state.modification_masses[i] = mass
-
- with col2:
- unimod = st.text_input(
- "UNIMOD",
- value=st.session_state.modification_values[i] if i < len(st.session_state.modification_values) else "UNIMOD:",
- key=f"mod_unimod_{i}",
- label_visibility="collapsed"
- )
- st.session_state.modification_values[i] = unimod
-
- # Add to modification map
- if mass and unimod:
- modification_map[mass] = unimod
-
- # Information about common modifications
- with st.expander("Common Modifications (Note: Values are examples, check your pepXML)", expanded=False):
- st.markdown("""
- | Mass (Full) | UNIMOD ID | Modification | Target Residues |
- |------|-----------|--------------|--------------|
- | 147.035385 | UNIMOD:35 | Oxidation | M |
- | 160.030649 | UNIMOD:4 | Carbamidomethyl | C |
-
- Note: These are full masses (amino acid + modification). You must check your pepXML file parameters to find the exact masses used in your data.
- """)
-
- return modification_map
diff --git a/optimhc/gui/components/pipeline_control.py b/optimhc/gui/components/pipeline_control.py
deleted file mode 100644
index 8dc4726..0000000
--- a/optimhc/gui/components/pipeline_control.py
+++ /dev/null
@@ -1,177 +0,0 @@
-"""
-Pipeline control component for optiMHC GUI.
-"""
-
-import os
-import subprocess
-import sys
-import tempfile
-import time
-from typing import Dict, Any, Optional, Tuple
-import streamlit as st
-import yaml
-
-from optimhc.gui.utils import create_temp_config_file, run_pipeline_command
-
-
-def pipeline_status_indicator(running: bool = False, success: Optional[bool] = None):
- """
- Display a status indicator for the pipeline.
-
- Args:
- running: Whether the pipeline is currently running
- success: Whether the pipeline completed successfully
- """
- if running:
- st.info("Pipeline is running...")
- elif success is not None:
- if success:
- st.success("Pipeline completed successfully")
- else:
- st.error("Pipeline failed")
- else:
- st.info("Pipeline not yet started")
-
-
-def pipeline_control_panel(config: Dict[str, Any]):
- """
- Create a control panel for running the pipeline.
-
- Args:
- config: Configuration dictionary
- """
- st.subheader("Pipeline Control")
-
- # Initialize session state
- if "pipeline_running" not in st.session_state:
- st.session_state.pipeline_running = False
-
- if "pipeline_process" not in st.session_state:
- st.session_state.pipeline_process = None
-
- if "pipeline_start_time" not in st.session_state:
- st.session_state.pipeline_start_time = None
-
- if "pipeline_config_path" not in st.session_state:
- st.session_state.pipeline_config_path = None
-
- # Display status
- col1, col2 = st.columns([1, 3])
-
- with col1:
- if st.session_state.pipeline_running:
- pipeline_status_indicator(running=True)
- else:
- if st.session_state.pipeline_process is not None:
- return_code = st.session_state.pipeline_process.poll()
- pipeline_status_indicator(success=(return_code == 0))
- else:
- pipeline_status_indicator()
-
- with col2:
- if st.session_state.pipeline_start_time:
- elapsed_time = time.time() - st.session_state.pipeline_start_time
- st.text(f"Running for: {int(elapsed_time // 60)}m {int(elapsed_time % 60)}s")
-
- # Control buttons
- start_disabled = st.session_state.pipeline_running
- stop_disabled = not st.session_state.pipeline_running
-
- col1, col2 = st.columns(2)
-
- with col1:
- if st.button("Start Pipeline", disabled=start_disabled, key="start_pipeline"):
- # Check if the configuration is valid
- if not config.get("experimentName"):
- st.error("Experiment name is required")
- return
-
- if not config.get("inputFile"):
- st.error("At least one input file is required")
- return
-
- if not config.get("allele"):
- st.error("At least one allele is required")
- return
-
- if not config.get("featureGenerator"):
- st.error("At least one feature generator is required")
- return
-
- # Create a temporary configuration file
- config_path = create_temp_config_file(config)
- st.session_state.pipeline_config_path = config_path
-
- # Run the pipeline as a subprocess
- st.session_state.pipeline_process = run_pipeline_command(config_path)
- st.session_state.pipeline_running = True
- st.session_state.pipeline_start_time = time.time()
-
- # Initialize logs
- if "logs" not in st.session_state:
- st.session_state.logs = []
-
- # Rerun to update UI
- st.rerun()
-
- with col2:
- if st.button("Stop Pipeline", disabled=stop_disabled, key="stop_pipeline"):
- if st.session_state.pipeline_process:
- # Terminate the process
- st.session_state.pipeline_process.terminate()
- st.session_state.pipeline_running = False
-
- # Wait for process to terminate
- try:
- st.session_state.pipeline_process.wait(timeout=5)
- except subprocess.TimeoutExpired:
- # Force kill if it doesn't terminate gracefully
- st.session_state.pipeline_process.kill()
-
- st.warning("Pipeline was stopped by user")
-
- # No longer need to cleanup config file since it's part of the output
-
- # Rerun to update UI
- st.rerun()
-
- # Save configuration button
- if st.button("Save Configuration to File"):
- # Create a download button for the configuration
- config_yaml = yaml.dump(config, default_flow_style=False)
-
- # Use streamlit's download button
- filename = f"{config.get('experimentName', 'optimhc_config')}.yaml"
- st.download_button(
- label="Download Configuration File",
- data=config_yaml,
- file_name=filename,
- mime="text/yaml"
- )
-
-
-def check_pipeline_status():
- """
- Check the status of a running pipeline.
-
- Returns:
- Tuple of (running, return_code)
- """
- running = st.session_state.get("pipeline_running", False)
- process = st.session_state.get("pipeline_process", None)
-
- if process is None:
- return False, None
-
- # Check if process is still running
- return_code = process.poll()
-
- if return_code is not None and running:
- # Process has completed
- st.session_state.pipeline_running = False
-
- # No longer need to cleanup config file since it's part of the output
-
- return False, return_code
-
- return running, return_code
diff --git a/optimhc/gui/components/rescore_form.py b/optimhc/gui/components/rescore_form.py
deleted file mode 100644
index e30b540..0000000
--- a/optimhc/gui/components/rescore_form.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""
-Rescoring form component for optiMHC GUI.
-"""
-
-import streamlit as st
-from typing import Dict, Any
-
-# Import optiMHC config defaults
-from optimhc.core.config import DEFAULT_CONFIG
-
-def rescore_form(existing_rescore: Dict[str, Any] = None) -> Dict[str, Any]:
- """
- Create a form for rescoring settings.
-
- Args:
- existing_rescore: Existing rescore configuration
-
- Returns:
- Rescore configuration dictionary
- """
- if existing_rescore is None:
- existing_rescore = DEFAULT_CONFIG["rescore"]
-
- st.subheader("Rescoring Settings")
-
- rescore_model = st.selectbox(
- "Rescoring Model",
- options=["Percolator", "XGBoost", "RandomForest"],
- index=["Percolator", "XGBoost", "RandomForest"].index(existing_rescore.get("model", "Percolator")),
- help="Model to use for rescoring"
- )
-
- test_fdr = st.number_input(
- "Test FDR",
- min_value=0.001,
- max_value=0.1,
- value=float(existing_rescore.get("testFDR", 0.01)),
- step=0.001,
- format="%.3f",
- help="FDR threshold for testing"
- )
-
- num_jobs = st.number_input(
- "Number of Jobs",
- min_value=1,
- max_value=32,
- value=int(existing_rescore.get("numJobs", 1)),
- help="Number of parallel jobs for model training"
- )
-
- return {
- "model": rescore_model,
- "testFDR": test_fdr,
- "numJobs": num_jobs
- }
diff --git a/optimhc/gui/components/results_viewer.py b/optimhc/gui/components/results_viewer.py
deleted file mode 100644
index 05be9e7..0000000
--- a/optimhc/gui/components/results_viewer.py
+++ /dev/null
@@ -1,248 +0,0 @@
-"""
-Results viewer component for optiMHC GUI.
-"""
-
-import os
-import glob
-import base64
-from typing import List, Dict, Any, Optional
-import streamlit as st
-import pandas as pd
-import plotly.express as px
-from pathlib import Path
-
-from optimhc.gui.utils import scan_output_directory
-
-
-def get_image_as_base64(file_path):
- """
- Get image file as base64 string.
-
- Args:
- file_path: Path to image file
-
- Returns:
- Base64 encoded image
- """
- with open(file_path, "rb") as image_file:
- return base64.b64encode(image_file.read()).decode()
-
-
-def display_image(file_path, caption=None):
- """
- Display an image with caption.
-
- Args:
- file_path: Path to image file
- caption: Optional caption for the image
- """
- try:
- # Use HTML to have more control over image sizing
- img_format = file_path.split('.')[-1].lower()
- img_base64 = get_image_as_base64(file_path)
- html = f''
-
- if caption:
- html = f'{html}