-"""
-pipeline.py
-
-Implements the main optiMHC pipeline for immunopeptidomics rescoring, including input parsing,
-feature generation, rescoring, result saving, and visualization. Supports both single-run and
-experiment modes.
-"""
-importos
-importlogging
-importgc
-importpandasaspd
-frommultiprocessingimportProcess
-frommokapot.modelimportPercolatorModel
-
-fromoptimhc.parserimportread_pin,read_pepxml
-fromoptimhc.rescoreimportmokapot
-fromoptimhc.rescore.modelimportXGBoostPercolatorModel,RandomForestPercolatorModel
-fromoptimhc.visualizationimport(
- plot_feature_importance,
- visualize_target_decoy_features,
- visualize_feature_correlation,
- plot_qvalues,
-)
-fromoptimhc.core.configimportload_config,Config
-fromoptimhc.core.logging_helperimportsetup_loggers
-fromoptimhc.core.feature_generationimportgenerate_features
-
-logger=logging.getLogger(__name__)
-
-
-[docs]
-classPipeline:
-"""
- Main pipeline class for optiMHC, encapsulating the full data processing workflow.
-
- This class orchestrates input parsing, feature generation, rescoring, result saving, and visualization.
- It supports both single-run and experiment modes (multiple feature/model combinations).
-
- Parameters
- ----------
- config : str, dict, or Config
- Path to YAML config, dict, or Config object.
-
- Examples
- --------
- >>> from optimhc.core import Pipeline
- >>> pipeline = Pipeline(config)
- >>> pipeline.run()
- """
-
class DeepLCFeatureGenerator(BaseFeatureGenerator):
    """
    Generate DeepLC-based features for rescoring.

    This generator uses DeepLC to predict retention times and calculates various
    features based on the differences between predicted and observed retention times.

    Parameters
    ----------
    psms : PsmContainer
        PSMs to generate features for.
    calibration_criteria_column : str
        Column name in the PSMs DataFrame to use for DeepLC calibration.
    lower_score_is_better : bool, optional
        Whether a lower PSM score denotes a better matching PSM. Default is False.
    calibration_set_size : int or float, optional
        Number (int) or fraction (float in (0, 1]) of best PSMs to use for DeepLC
        calibration. If None (the default), calibration is skipped.
    processes : int, optional
        Number of processes to use in DeepLC. Default is 1.
    model_path : str, optional
        Path to the DeepLC model. If None, the default model will be used.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the first and last amino acids from the peptide sequence.
        Default is True.
    mod_dict : dict, optional
        Dictionary of modifications to be used for DeepLC. If None, modifications
        are stripped from the sequences.

    Notes
    -----
    The generated features include:

    - observed_retention_time: Original retention time from the data
    - predicted_retention_time: DeepLC predicted retention time
    - retention_time_diff: Difference between predicted and observed times
    - abs_retention_time_diff: Absolute difference between predicted and observed times
    - retention_time_ratio: Ratio of min(pred, obs) to max(pred, obs)
    """

    def __init__(
        self,
        psms: PsmContainer,
        calibration_criteria_column: str,
        lower_score_is_better: bool = False,
        calibration_set_size: Union[int, float, None] = None,
        processes: int = 1,
        model_path: Optional[str] = None,
        remove_pre_nxt_aa: bool = True,
        mod_dict: Optional[Dict[str, str]] = None,
        *args,
        **kwargs,
    ):
        """
        Initialize the generator and the underlying DeepLC predictor.

        Parameters are documented in the class docstring.

        Notes
        -----
        ``*args`` and ``**kwargs`` are accepted but currently NOT forwarded to
        the DeepLC constructor. NOTE(review): earlier documentation claimed
        they were passed through (e.g. ``deeplc_retrain``) — confirm whether
        forwarding is intended before relying on it.
        """
        self.psms = psms
        self.lower_score_is_better = lower_score_is_better
        self.calibration_criteria_column = calibration_criteria_column
        self.calibration_set_size = calibration_set_size
        self.processes = processes
        self.model_path = model_path
        self.remove_pre_nxt_aa = remove_pre_nxt_aa
        self.mod_dict = mod_dict
        # Pre-compute the DeepLC-formatted table once; generate_features()
        # refreshes it before predicting.
        self.deeplc_df = self._get_deeplc_df()
        self.DeepLC = DeepLC
        self._raw_predictions = None
        if model_path is not None:
            self.deeplc_predictor = self.DeepLC(
                n_jobs=self.processes,
                path_model=model_path,
            )
        else:
            self.deeplc_predictor = self.DeepLC(n_jobs=self.processes)
        logger.info(
            f"Initialized DeepLCFeatureGenerator with {len(self.psms)} PSMs."
            f" Calibration criteria: {self.calibration_criteria_column}."
            f" Lower score is better: {self.lower_score_is_better}."
            f" Calibration set size: {self.calibration_set_size}."
            f" Processes: {self.processes}."
            f" Model path: {self.model_path}."
        )

    @property
    def feature_columns(self) -> List[str]:
        """
        Return the list of generated feature column names.

        Returns
        -------
        List[str]
            List of feature column names:
            - observed_retention_time
            - predicted_retention_time
            - retention_time_diff
            - abs_retention_time_diff
            - retention_time_ratio
        """
        return [
            "observed_retention_time",
            "predicted_retention_time",
            "retention_time_diff",
            "abs_retention_time_diff",
            "retention_time_ratio",
        ]

    @property
    def id_column(self) -> List[str]:
        """
        Return the list of input columns required for the feature generator.

        Returns
        -------
        List[str]
            A single-element list containing an empty string; the columns
            actually required are handled internally by the PsmContainer.
            NOTE(review): previous documentation claimed an empty list was
            returned — confirm whether ``[""]`` or ``[]`` is intended by
            consumers of this property.
        """
        return [""]

    def _get_deeplc_df(self):
        """
        Extract the format required by DeepLC, while retaining necessary original information.

        Returns
        -------
        pd.DataFrame
            DataFrame with the required DeepLC format and original information:
            - original_seq: Original peptide sequence
            - label: Target/decoy label
            - seq: Cleaned peptide sequence
            - modifications: Unimod format modifications
            - tr: Retention time
            - score: Calibration criteria score

        Raises
        ------
        ValueError
            If retention time column is not found in the PSMs DataFrame.

        Notes
        -----
        This method prepares the data in the format required by DeepLC,
        including cleaning peptide sequences and converting modifications
        to Unimod format.
        """
        df_deeplc = pd.DataFrame()
        df_psm = self.psms.psms
        df_deeplc["original_seq"] = df_psm[self.psms.peptide_column]
        df_deeplc["label"] = df_psm[self.psms.label_column]

        if self.remove_pre_nxt_aa:
            df_deeplc["seq"] = df_deeplc["original_seq"].apply(
                utils.remove_pre_and_nxt_aa
            )
        else:
            df_deeplc["seq"] = df_deeplc["original_seq"]

        # Apply extract_unimod_from_peptidoform once and store both results.
        if self.mod_dict is None:
            logger.warning("No mod_dict provided. Removing modifications.")
            df_deeplc["seq"] = df_deeplc["seq"].apply(utils.remove_modifications)
            df_deeplc["modifications"] = ""
        else:
            extracted_results = df_deeplc["seq"].apply(
                lambda x: utils.extract_unimod_from_peptidoform(
                    x, mod_dict=self.mod_dict
                )
            )
            df_deeplc["seq"] = extracted_results.apply(lambda x: x[0])
            df_deeplc["modifications"] = extracted_results.apply(lambda x: x[1])

        if self.psms.retention_time_column is None:
            raise ValueError("DeepLC requires retention time values.")

        df_deeplc["tr"] = df_psm[self.psms.retention_time_column]
        df_deeplc["score"] = df_psm[self.calibration_criteria_column]

        logger.debug("DeepLC input DataFrame:")
        logger.debug(df_deeplc)

        return df_deeplc

    def generate_features(self) -> pd.DataFrame:
        """
        Generate DeepLC features for the provided PSMs.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the PSMs with added DeepLC features:
            - original_seq: Original peptide sequence
            - observed_retention_time: Original retention time
            - predicted_retention_time: DeepLC predicted retention time
            - retention_time_diff: Difference between predicted and observed times
            - abs_retention_time_diff: Absolute difference between predicted and observed times
            - retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)

        Notes
        -----
        This method:
        1. Prepares data in DeepLC format
        2. Calibrates DeepLC if a calibration set size is specified
        3. Predicts retention times
        4. Calculates various retention time-based features
        5. Handles missing values by imputing with median values
        """
        logger.info("Generating DeepLC features.")

        # Extract DeepLC input DataFrame
        self.deeplc_df = self._get_deeplc_df()

        # Calibrate DeepLC predictor
        if self.calibration_set_size:
            calibration_df = self._get_calibration_psms(self.deeplc_df)
            logger.debug(f"Calibrating DeepLC with {len(calibration_df)} PSMs.")
            self.deeplc_predictor.calibrate_preds(
                seq_df=calibration_df[["seq", "tr", "modifications"]]
            )

        # Predict retention times
        logger.info("Predicting retention times using DeepLC.")
        predictions = self.deeplc_predictor.make_preds(
            seq_df=self.deeplc_df[["seq", "tr", "modifications"]]
        )

        self._raw_predictions = pd.DataFrame(
            {
                "peptide": self.deeplc_df["seq"],
                "predicted_rt": predictions,
                "observed_rt": self.deeplc_df["tr"],
                "modifications": self.deeplc_df["modifications"],
            }
        )

        # Calculate retention time differences
        rt_diffs = predictions - self.deeplc_df["tr"]
        self.deeplc_df["predicted_retention_time"] = predictions
        self.deeplc_df["retention_time_diff"] = rt_diffs

        result_df = pd.DataFrame()
        result_df["original_seq"] = self.deeplc_df["original_seq"]
        result_df["observed_retention_time"] = self.deeplc_df["tr"]
        result_df["predicted_retention_time"] = self.deeplc_df[
            "predicted_retention_time"
        ]
        result_df["retention_time_diff"] = self.deeplc_df["retention_time_diff"]
        result_df["abs_retention_time_diff"] = self.deeplc_df[
            "retention_time_diff"
        ].abs()

        # Adopt from 'DeepRescore2': RTR = min(pred, obs) / max(pred, obs)
        result_df["retention_time_ratio"] = np.minimum(
            result_df["predicted_retention_time"], result_df["observed_retention_time"]
        ) / np.maximum(
            result_df["predicted_retention_time"], result_df["observed_retention_time"]
        )

        for col in self.feature_columns:
            nan_rows = result_df[result_df[col].isna()]
            if not nan_rows.empty:
                logger.warning(
                    f"Column {col} contains NaN values. Rows with NaN values:\n{nan_rows}"
                )
            median_value = result_df[col].median()
            # Assign the filled column back instead of calling
            # fillna(..., inplace=True) on a column selection: that pattern is
            # chained assignment, which is deprecated in modern pandas and may
            # silently operate on a copy.
            result_df[col] = result_df[col].fillna(median_value)
            result_df[col] = result_df[col].astype(float)

        return result_df

    def _get_calibration_psms(self, deeplc_df: pd.DataFrame) -> pd.DataFrame:
        """
        Get the best scoring PSMs for calibration based on the calibration criteria.

        Parameters
        ----------
        deeplc_df : pd.DataFrame
            DataFrame containing DeepLC input data.

        Returns
        -------
        pd.DataFrame
            DataFrame of PSMs selected for calibration, containing only target PSMs.

        Raises
        ------
        ValueError
            If calibration_set_size is a float not between 0 and 1.
        TypeError
            If calibration_set_size is neither int nor float.

        Notes
        -----
        This method:
        1. Sorts PSMs based on calibration criteria
        2. Selects top N PSMs based on calibration_set_size
        3. Filters to keep only target PSMs
        """
        logger.debug("Selecting PSMs for calibration.")

        # Sort PSMs based on calibration criteria
        sorted_psms = deeplc_df.sort_values(
            by="score", ascending=self.lower_score_is_better
        )

        # Select calibration set
        if isinstance(self.calibration_set_size, float):
            if not 0 < self.calibration_set_size <= 1:
                logger.error("calibration_set_size as float must be between 0 and 1.")
                raise ValueError(
                    "If `calibration_set_size` is a float, it must be between 0 and 1."
                )
            n_cal = int(len(sorted_psms) * self.calibration_set_size)
        elif isinstance(self.calibration_set_size, int):
            n_cal = self.calibration_set_size
            if n_cal > len(sorted_psms):
                logger.warning(
                    f"Requested calibration_set_size ({n_cal}) exceeds number of PSMs ({len(sorted_psms)}). Using all PSMs for calibration."
                )
                n_cal = len(sorted_psms)
        else:
            logger.error("calibration_set_size must be either int or float.")
            raise TypeError(
                "Expected int or float for `calibration_set_size`. "
                f"Got {type(self.calibration_set_size)} instead."
            )

        calibration_psms = sorted_psms.head(n_cal)
        logger.debug(f"Selected {n_cal} PSMs for calibration.")
        # Element-wise comparison on the Series: keep `== True` (labels may be
        # stored as booleans or 0/1 integers); do not replace with `is True`.
        calibration_psms = calibration_psms[calibration_psms["label"] == True]
        logger.debug(f"Selected {len(calibration_psms)} target PSMs for calibration.")
        return calibration_psms

    def get_full_data(self) -> pd.DataFrame:
        """
        Get the full DeepLC DataFrame.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the DeepLC input data with all columns:
            - original_seq: Original peptide sequence
            - label: Target/decoy label
            - seq: Cleaned peptide sequence
            - modifications: Unimod format modifications
            - tr: Retention time
            - score: Calibration criteria score
            - predicted_retention_time: DeepLC predicted retention time
              (present only after generate_features() has run)
            - retention_time_diff: Difference between predicted and observed times
              (present only after generate_features() has run)
        """
        return self.deeplc_df

    @property
    def raw_predictions(self) -> pd.DataFrame:
        """
        Get the raw predictions DataFrame.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the raw predictions:
            - peptide: Cleaned peptide sequence
            - predicted_rt: DeepLC predicted retention time
            - observed_rt: Original retention time
            - modifications: Unimod format modifications

        Notes
        -----
        If predictions haven't been generated yet, this will trigger
        feature generation automatically.
        """
        if self._raw_predictions is None:
            self.generate_features()
        return self._raw_predictions

    def get_raw_predictions(self) -> pd.DataFrame:
        """
        Get the raw predictions DataFrame.

        Returns
        -------
        pd.DataFrame
            Same data as the ``raw_predictions`` property; this is a
            convenience method for callers that prefer a method call.
        """
        return self.raw_predictions

    def save_raw_predictions(self, file_path: str, **kwargs) -> None:
        """
        Save the raw prediction results to a CSV file.

        Parameters
        ----------
        file_path : str
            Path to save the file.
        **kwargs : dict
            Additional parameters passed to pandas.DataFrame.to_csv.
            If 'index' is not specified, it defaults to False.

        Notes
        -----
        Accessing ``raw_predictions`` triggers feature generation if it has
        not run yet, so the warning branch below only fires if generation
        itself left the predictions unset.
        """
        kwargs.setdefault("index", False)
        if self.raw_predictions is not None:
            self.raw_predictions.to_csv(file_path, **kwargs)
            logger.info(f"Raw predictions saved to {file_path}")
        else:
            logger.warning("Raw predictions have not been generated yet.")
-# feature_generator/PWM.py
-
import logging
import os
import re
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from optimhc import utils
from optimhc.feature_generator.base_feature_generator import BaseFeatureGenerator
-
-logger=logging.getLogger(__name__)
-
# n_flank_pwm and c_flank_pwm are used for MHC class II PWM calculation.
# The original PPM is recorded in data/PWMs/II/n_flank_ppm and data/PWMs/II/c_flank_ppm,
# which is converted to PWM by taking the background frequency as 0.05.

# C-terminal flank PWM: weight of each residue at the three positions
# immediately following the 9-mer binding core (Pos1 = closest to the core).
c_flank_pwm_data = {
    "A": [0.891194, 0.599125, 0.567353],
    "C": [-3.952777, -4.345220, -4.612584],
    "D": [0.291173, 0.550133, 0.325690],
    "E": [0.687212, 0.662834, 0.834717],
    "F": [-1.250652, -0.784627, -1.139232],
    "G": [0.509354, -0.368370, 0.919885],
    "H": [-0.808229, -0.836628, -0.591508],
    "I": [-0.046196, -0.034123, -1.564360],
    "K": [0.452471, 0.665617, 1.086677],
    "L": [0.522681, 0.382607, 0.208291],
    "M": [-2.131278, -2.144693, -2.008653],
    "N": [-0.208044, -0.699085, 0.022668],
    "P": [0.417673, 1.179052, -0.018850],
    "Q": [-0.033656, -0.051822, 0.274208],
    "R": [0.535173, 0.397917, 0.924583],
    "S": [0.542669, 0.255087, 0.360228],
    "T": [-0.359617, -0.322741, -0.028565],
    "V": [0.507861, 0.582748, -0.584754],
    "W": [-3.432776, -3.024952, -3.391619],
    "Y": [-1.144700, -0.725718, -1.221789],
    # Add a pseudo-entry for 'X' with 0 contribution
    # ('X' is the padding residue used when a flank is shorter than 3).
    "X": [0.0, 0.0, 0.0],
}

# Rows are residues, columns are the three flank positions.
c_flank_pwm = pd.DataFrame.from_dict(
    c_flank_pwm_data, orient="index", columns=["Pos1", "Pos2", "Pos3"]
)

# N-terminal flank PWM: weight of each residue at the three positions
# immediately preceding the 9-mer binding core.
n_flank_pwm_data = {
    "A": [0.672938, 0.494511, 0.290216],
    "C": [-5.464582, -5.140732, -5.112201],
    "D": [0.856850, 0.732683, 0.398964],
    "E": [0.692225, 0.660452, 0.746641],
    "F": [-1.024461, -1.529751, -0.687119],
    "G": [0.873872, 0.746332, 0.630604],
    "H": [-1.386627, -1.212169, -1.011943],
    "I": [0.138351, -0.461093, 0.095419],
    "K": [0.801095, 0.639492, 0.847272],
    "L": [0.562420, -0.162599, 0.201511],
    "M": [-2.230132, -2.754557, -2.397489],
    "N": [-0.198452, -0.099572, -0.214853],
    "P": [-1.491966, 1.405721, 0.532710],
    "Q": [-0.622442, -0.151155, -0.006518],
    "R": [0.216375, 0.217991, 0.545768],
    "S": [0.623057, 0.416164, 0.236042],
    "T": [0.160517, 0.011151, -0.111494],
    "V": [0.523780, 0.239183, 0.330202],
    "W": [-3.276340, -4.050898, -2.959115],
    "Y": [-1.060520, -1.689795, -0.674071],
    # Add a pseudo-entry for 'X' with 0 contribution
    # ('X' is the padding residue used when a flank is shorter than 3).
    "X": [0.0, 0.0, 0.0],
}

n_flank_pwm = pd.DataFrame.from_dict(
    n_flank_pwm_data, orient="index", columns=["Pos1", "Pos2", "Pos3"]
)
-
-
-
class PWMFeatureGenerator(BaseFeatureGenerator):
    """
    Generates PWM (Position Weight Matrix) features for peptides based on specified MHC alleles.

    This generator calculates PWM scores for each peptide against the provided MHC class I or II allele PWMs.

    Parameters
    ----------
    peptides : list of str
        Series of peptide sequences.
    alleles : list of str
        List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']).
    anchors : int, optional
        Number of anchor positions to consider for MHC class I. Default is 2.
    mhc_class : str, optional
        MHC class, either 'I' or 'II'. Default is 'I'.
    pwm_path : str or os.PathLike, optional
        Custom path to PWM files. Defaults to ``DEFAULT_PWM_PATH`` (the
        bundled ``PWMs`` directory next to this package).
    remove_pre_nxt_aa : bool, optional
        Whether to include the previous and next amino acids in peptides.
        If True, remove them. Default is False.
    remove_modification : bool, optional
        Whether to include modifications in peptides.
        If True, remove them. Default is True.

    Attributes
    ----------
    peptides : pd.Series
        Series of peptide sequences.
    alleles : list of str
        List of MHC allele names.
    mhc_class : str
        MHC class ('I' or 'II'), normalized to upper case.
    pwm_path : str or os.PathLike
        Path to PWM files.
    pwms : dict
        Dictionary of PWMs for each allele and mer length.
    anchors : int
        Number of anchor positions for MHC class I.
    remove_pre_nxt_aa : bool
        Whether to remove pre/post neighboring amino acids.
    remove_modification : bool
        Whether to remove modifications.

    Notes
    -----
    For MHC class I:
    - Generates 'PWM_Score_{allele}' and optionally 'Anchor_Score_{allele}' columns.
    For MHC class II:
    - Generates 'PWM_Score_{allele}' (core 9-mer),
    - 'N_Flank_PWM_Score_{allele}',
    - 'C_Flank_PWM_Score_{allele}' columns.
    """

    # Length of the MHC class II binding core scored by the sliding window.
    CLASS_II_CORE_LENGTH = 9
    # Bundled PWM directory, resolved relative to this module's location.
    DEFAULT_PWM_PATH = os.path.join(os.path.dirname(__file__), "..", "PWMs")
-
-
-[docs]
- def__init__(
- self,
- peptides:List[str],
- alleles:List[str],
- anchors:int=2,
- mhc_class:str="I",
- pwm_path:Optional[Union[str,os.PathLike]]=None,
- remove_pre_nxt_aa:bool=False,
- remove_modification:bool=True,
- *args,
- **kwargs,
- ):
-"""
- Initializes the PWMFeatureGenerator.
-
- Parameters:
- peptides (List[str]): Series of peptide sequences.
- alleles (List[str]): List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']).
- mhc_class (str): MHC class, either 'I' or 'II'. Default is 'I'.
- pwm_path (Optional[Union[str, os.PathLike]]): Custom path to PWM files. Defaults to '../../data/PWMs'.
- remove_pre_nxt_aa (bool): Whether to include the previous and next amino acids in peptides.
- If True, remove them. Default is False.
- remove_modification (bool): Whether to include modifications in peptides.
- If True, remove them. Default is True.
- """
- self.peptides=pd.Series(peptides)
- self.alleles=alleles
- self.mhc_class=mhc_class.upper()
- ifself.mhc_classnotin{"I","II"}:
- raiseValueError("MHC class must be 'I' or 'II'.")
- self.pwm_path=pwm_pathifpwm_pathelsePWMFeatureGenerator.DEFAULT_PWM_PATH
- logger.info(f"PWM path: {self.pwm_path}")
- self.pwms:Dict[str,Dict[int,pd.DataFrame]]=(
- self._load_pwms()
- )# Dict[allele, Dict[mer, pd.DataFrame]]
-
- forallele,pwmsinself.pwms.items():
- former,pwminpwms.items():
- logger.debug(
- f"Loaded PWM for allele {allele}, length {mer}: {pwm.shape[1]} positions"
- )
- logger.debug(pwm)
- self.anchors=anchors
- ifmhc_class=="I"andanchors>0:
- logger.info("Number of anchors: {}".format(self.anchors))
- self.remove_pre_nxt_aa=remove_pre_nxt_aa
- logger.info(
- "Remove pre and next amino acids: {}".format(self.remove_pre_nxt_aa)
- )
- self.remove_modification=remove_modification
- logger.info("Remove modifications: {}".format(self.remove_modification))
- logger.info(
- f"Initialized PWMFeatureGenerator with {len(peptides)} peptides and alleles: {alleles}"
- )
-
-
- @property
- defid_column(self)->List[str]:
-"""
- Get a list of input columns required for the feature generator.
-
- Returns
- -------
- list of str
- List of column names required for feature generation.
- """
- return["Peptide"]
-
-
-[docs]
- def_extract_trailing_numbers(self,text:str)->str:
-"""
- Extract the trailing numbers from a string.
-
- Parameters
- ----------
- text : str
- Input string to extract numbers from.
-
- Returns
- -------
- str or None
- The trailing numbers if found, None otherwise.
-
- Examples
- --------
- >>> _extract_trailing_numbers('ABC123')
- '123'
- >>> _extract_trailing_numbers('ABC')
- None
- >>> _extract_trailing_numbers('L123')
- '123'
- """
- importre
-
- match=re.search(r"\d+$",text)
- returnmatch.group()ifmatchelseNone
-
-
-
-[docs]
- def_default_allele_pwm_files(self)->Dict[str,Dict[int,str]]:
-"""
- Construct default PWM file paths for each allele based on MHC class.
-
- Returns
- -------
- dict of str to dict of int to str
- Dictionary mapping alleles to their PWM files for each mer length.
- Format: {allele: {mer_length: file_path}}
-
- Notes
- -----
- For MHC class I:
- - Allele directory format: HLA-A01:01 -> HLA-A01_01
- For MHC class II:
- - Allele directory format: DRB10101 -> DRB1_0101
- - Fixed core length of 9
- """
- class_path=os.path.join(self.pwm_path,self.mhc_class)
- pwm_files={}
- foralleleinself.alleles:
- pwm_files[allele]={}
- ifself.mhc_class=="I":
- allele_dir=allele.replace("*","").replace(":","_")
- elifself.mhc_class=="II":
- allele_dir=allele.replace("*","").replace(":","")
- iflen(allele_dir)==8:
- # DRB10101 -> DRB1_0101
- allele_dir=f"{allele_dir[:4]}_{allele_dir[4:]}"
- allele_dir_path=os.path.join(class_path,allele_dir)
- pwm_files_list=utils.list_all_files_in_directory(allele_dir_path)
- logger.info(
- f"Searching for PWM files for allele {allele} in {allele_dir_path}"
- )
- logger.debug(f"Found PWM files for allele {allele}: {pwm_files_list}")
- ifself.mhc_class=="I":
- forpwm_fileinpwm_files_list:
- # assume the trailing numbers in the file name indicate the mer length
- mer=int(self._extract_trailing_numbers(str(pwm_file)))
- ifmerisnotNone:
- pwm_files[allele][mer]=pwm_file
- else:
- logger.error(
- f"Mer length not found in PWM file name: {pwm_file}. Assuming the tailing number is the mer length."
- )
- raiseValueError(
- f"Mer length not found in PWM file name: {pwm_file}"
- )
- elifself.mhc_class=="II":
- # class II PWMs are fixed length: 9, which indicates the core length
- iflen(pwm_files_list)==1:
- pwm_files[allele][PWMFeatureGenerator.CLASS_II_CORE_LENGTH]=(
- pwm_files_list[0]
- )
- else:
- logger.error(
- f"Expected 1 PWM file for allele {allele}, found {len(pwm_files_list)}: {pwm_files_list}"
- )
- raiseValueError(
- f"Expected 1 PWM file for allele {allele}, found {len(pwm_files_list)}: {pwm_files_list}"
- )
-
- logger.debug(f"Default PWM file paths set for alleles: {self.alleles}")
- returnpwm_files
-
-
-
-[docs]
- def_most_conserved_postions(self,pwm:pd.DataFrame,n:int=2)->List[int]:
-"""
- Find the n most conserved positions in the PWM.
-
- Parameters
- ----------
- pwm : pd.DataFrame
- Position Weight Matrix to analyze.
- n : int, optional
- Number of positions to return. Default is 2.
-
- Returns
- -------
- list of int
- Indices of the n most conserved positions.
-
- Raises
- ------
- ValueError
- If n exceeds the PWM length.
-
- Notes
- -----
- In our study, we only use the anchor score for class I MHC.
- Conservation is measured using Shannon entropy.
- """
- ifn>pwm.shape[1]:
- raiseValueError(
- f"Number of positions to return ({n}) exceeds the PWM length ({pwm.shape[1]})."
- )
- pfm=pwm.apply(lambdax:2**x)
- entropy=-1*(pfm*np.log2(pfm)).sum(axis=0)
- returnentropy.nsmallest(n).index.tolist()
-
-
-
-[docs]
- def_load_pwms(self)->Dict[str,Dict[int,pd.DataFrame]]:
-"""
- Load PWMs for each allele from the constructed file paths.
-
- Returns
- -------
- dict of str to dict of int to pd.DataFrame
- Dictionary of PWMs for each allele and mer length.
- Format: {allele: {mer_length: pwm_dataframe}}
-
- Raises
- ------
- FileNotFoundError
- If a PWM file is not found.
- Exception
- If there is an error loading a PWM file.
-
- Notes
- -----
- PWM files are expected to be space-delimited text files with amino acids as row indices
- and positions as column indices.
- """
- pwms={}
- allele_pwm_files=self._default_allele_pwm_files()
- forallele,mer_filesinallele_pwm_files.items():
- pwms[allele]={}
- former,file_pathinmer_files.items():
- ifnotos.path.exists(file_path):
- logger.error(f"PWM file not found: {file_path}")
- raiseFileNotFoundError(f"PWM file not found: {file_path}")
- try:
- pwm=pd.read_csv(
- file_path,sep='\s+',header=None,index_col=0
- )
- pwm.columns=[f"{pos+1}"forposinrange(pwm.shape[1])]
- pwms[allele][mer]=pwm
- logger.debug(
- f"Loaded PWM for allele {allele}, length {mer} from {file_path}"
- )
- exceptExceptionase:
- logger.error(f"Error loading PWM file {file_path}: {e}")
- raisee
- returnpwms
-
-
-
-[docs]
- def_cal_PWM_score_I(self,peptide:str,allele:str)->Optional[float]:
-"""
- Calculate PWM scores for MHC class I.
-
- Parameters
- ----------
- peptide : str
- The peptide sequence to score.
- allele : str
- The MHC allele to score against.
-
- Returns
- -------
- float or None
- PWM score for the peptide against the allele's PWM, or None if out of range.
-
- Notes
- -----
- If peptide length is out of range, returns None.
- """
- peptide_len=len(peptide)
- min_mer=min(self.pwms[allele].keys())
- max_mer=max(self.pwms[allele].keys())
-
- ifpeptide_len<min_merorpeptide_len>max_mer:
- returnpd.NA
- else:
- pwm=self.pwms[allele][peptide_len]
- try:
- score=sum(pwm.loc[aa,str(i+1)]fori,aainenumerate(peptide))
- exceptKeyErrorase:
- logger.error(
- f"Residue '{e}' not found in PWM for allele={allele}, "
- f"peptide={peptide}, length={peptide_len}"
- )
- raiseValueError(f"Residue '{e}' not found in PWM.")
- returnscore
-
-
-
    def _cal_PWM_score_II(
        self, peptide: str, allele: str
    ) -> Tuple[Optional[float], Optional[float], Optional[float]]:
        """
        Calculate PWM scores for MHC class II using a sliding 9-mer window.

        Parameters
        ----------
        peptide : str
            The peptide sequence to score.
        allele : str
            The MHC allele to score against.

        Returns
        -------
        tuple of (float or pd.NA, float or pd.NA, float or pd.NA)
            ``(core_score, n_flank_score, c_flank_score)``:
            - core_score: score of the best 9-mer core window.
            - n_flank_score: score of the 3 residues before the core.
            - c_flank_score: score of the 3 residues after the core.
            Returns ``(pd.NA, pd.NA, pd.NA)`` if the peptide is shorter than 9.

        Raises
        ------
        ValueError
            If a residue of a core window is missing from the allele PWM.

        Notes
        -----
        1. Slides over all possible 9-mer windows to find the highest core PWM score.
        2. Extracts up to 3 residues on each side of the best core (N-/C-flank).
        3. Pads a flank shorter than 3 residues with 'X'.
        4. Scores the flanks with the module-level ``n_flank_pwm`` / ``c_flank_pwm``
           tables; 'X' contributes 0 there, and any residue missing from those
           tables is silently skipped (contributes 0).
        """
        core_len = PWMFeatureGenerator.CLASS_II_CORE_LENGTH
        pwm = self.pwms[allele][core_len]

        if len(peptide) < core_len:
            # Return NaNs for the scores if peptide too short
            return (pd.NA, pd.NA, pd.NA)

        best_score = None
        best_core = None
        best_core_start_idx = 0

        # Slide over possible 9-mer windows
        for start_idx in range(len(peptide) - core_len + 1):
            subpeptide_9 = peptide[start_idx : start_idx + core_len]
            try:
                tmp_score = sum(
                    float(pwm.loc[aa, str(i + 1)]) for i, aa in enumerate(subpeptide_9)
                )
            except KeyError as e:
                logger.error(
                    f"Residue '{e}' not found in PWM for allele={allele}, "
                    f"peptide={peptide} (subpep={subpeptide_9})."
                )
                raise ValueError(f"Residue '{e}' not found in PWM.")

            # Strictly-greater keeps the earliest window on score ties.
            if (best_score is None) or (tmp_score > best_score):
                best_score = tmp_score
                best_core = subpeptide_9
                best_core_start_idx = start_idx

        logger.debug(
            f"Peptide: {peptide}, best core: {best_core}, "
            f"start_idx: {best_core_start_idx}, score: {best_score}"
        )

        # Compute N-flank and C-flank (up to 3 residues)
        n_flank_start = max(0, best_core_start_idx - 3)
        n_flank_end = best_core_start_idx
        c_flank_start = best_core_start_idx + core_len
        c_flank_end = min(len(peptide), c_flank_start + 3)

        n_flank_seq = peptide[n_flank_start:n_flank_end]
        c_flank_seq = peptide[c_flank_start:c_flank_end]

        # Pad with 'X' if flank < 3 residues; N-flank pads on the left,
        # C-flank pads on the right, so both end up exactly 3 long.
        n_flank_seq = (("X" * (3 - len(n_flank_seq))) + n_flank_seq)[
            -3:
        ]  # ensure length = 3
        c_flank_seq = (c_flank_seq + ("X" * (3 - len(c_flank_seq))))[
            :3
        ]  # ensure length = 3

        n_flank_score = 0.0
        c_flank_score = 0.0
        for i, aa in enumerate(n_flank_seq):
            try:
                n_flank_score += n_flank_pwm.loc[aa, f"Pos{i+1}"]
            except KeyError:
                # Unknown residue -> 0 contribution ('X' itself is present in
                # the table with weight 0, so this only fires for residues
                # absent from the flank tables).
                pass

        for i, aa in enumerate(c_flank_seq):
            try:
                c_flank_score += c_flank_pwm.loc[aa, f"Pos{i+1}"]
            except KeyError:
                # Unknown residue -> 0 contribution.
                pass

        logger.debug(
            f"n_flank: {n_flank_seq}, c_flank: {c_flank_seq}, "
            f"n_flank_score: {n_flank_score}, c_flank_score: {c_flank_score}"
        )

        return (best_score, n_flank_score, c_flank_score)
-
-
-
-[docs]
- def_cal_PWM_score(
- self,peptide:str,allele:str
- )->Union[float,Tuple[float,float,float]]:
-"""
- Calculates PWM scores for a single peptide across all applicable mer lengths for a given allele.
-
- For MHC class I:
- Returns a single float (or pd.NA).
- For MHC class II:
- Returns a tuple of (core_score, n_flank_score, c_flank_score) or (pd.NA, pd.NA, pd.NA).
- """
- ifself.mhc_class=="I":
- returnself._cal_PWM_score_I(peptide,allele)
- else:
- returnself._cal_PWM_score_II(peptide,allele)
-
-
-
def _cal_anchor_score(
    self, peptide: str, allele: str, anchor_dict: Dict[int, List[int]]
) -> Optional[float]:
    """
    Calculate the anchor score for a single peptide against a given allele.

    Parameters
    ----------
    peptide : str
        The peptide sequence to score.
    allele : str
        The MHC allele to score against.
    anchor_dict : dict of int to list of int
        Most conserved (anchor) positions for each supported mer length.

    Returns
    -------
    float, pd.NA, or None
        Sum of PWM values at the anchor positions; ``pd.NA`` if the peptide
        length is outside the supported mer range or no PWM exists for that
        length; ``None`` for MHC class II (not implemented).

    Raises
    ------
    ValueError
        If an anchor position or residue is missing from the PWM.

    Notes
    -----
    Only implemented for MHC class I. For MHC class II, returns None.
    """
    peptide_len = len(peptide)
    if self.mhc_class == "I":
        min_mer = min(self.pwms[allele].keys())
        max_mer = max(self.pwms[allele].keys())
        if peptide_len < min_mer or peptide_len > max_mer:
            return pd.NA
        pwm = self.pwms[allele].get(peptide_len)
        if pwm is None:
            # A length inside [min_mer, max_mer] may still lack a PWM; the
            # original code would crash on `None.loc` here. Treat it like an
            # unsupported length and let the caller impute the value.
            return pd.NA
        score = 0.0
        try:
            for anchor in anchor_dict[peptide_len]:
                anchor = int(anchor)
                # PWM rows are residues; columns are 1-based positions
                # encoded as strings.
                score += pwm.loc[peptide[anchor - 1], str(anchor)]
        except KeyError as e:
            logger.error(
                f"Position '{e}' not found in PWM for allele {allele} and peptide {peptide} with length {peptide_len}."
            )
            raise ValueError(f"Position '{e}' not found in PWM.")
        return score
    elif self.mhc_class == "II":
        logger.warning("Anchor score calculation not implemented for MHC class II.")
        return None
-
-
-
def set_pwms(self, pwms: Dict[str, Dict[int, pd.DataFrame]]):
    """
    Replace the generator's PWMs with user-supplied ones.

    Parameters
    ----------
    pwms : dict of str to dict of int to pd.DataFrame
        PWMs keyed first by allele, then by mer length, i.e.
        ``{allele: {mer_length: pwm_dataframe}}``.
    """
    self.pwms = pwms
    logger.info(f"Set custom PWMs for alleles: {list(pwms.keys())}")
-
-
-
def generate_features(self) -> pd.DataFrame:
    """
    Generate PWM features for all peptides across specified alleles.

    Returns
    -------
    pd.DataFrame
        DataFrame containing generated features:
        For MHC class I:
            - 'PWM_Score_{allele}' and optionally 'Anchor_Score_{allele}' columns.
        For MHC class II:
            - 'PWM_Score_{allele}' (core 9-mer),
            - 'N_Flank_PWM_Score_{allele}',
            - 'C_Flank_PWM_Score_{allele}' columns.

    Notes
    -----
    Missing values are imputed with the median value for each feature.
    """
    features_df = pd.DataFrame(self.peptides, columns=["Peptide"])
    # Keep the original 'Peptide' column untouched as the merge key and do
    # all cleanup on a working copy.
    features_df["clean_peptide"] = features_df["Peptide"]
    if self.remove_pre_nxt_aa:
        features_df["clean_peptide"] = features_df["Peptide"].apply(
            utils.remove_pre_and_nxt_aa
        )
    if self.remove_modification:
        features_df["clean_peptide"] = features_df["clean_peptide"].apply(
            utils.remove_modifications
        )

    # Convert nonstandard amino acids: U -> C
    features_df["clean_peptide"] = features_df["clean_peptide"].apply(
        lambda x: x.replace("U", "C")
    )

    for allele in self.alleles:
        logger.info(
            f"Generating PWM scores for allele: {allele}, total peptides: {len(features_df)}"
        )

        if self.mhc_class == "I":
            # Class I returns a single score
            features_df[f"PWM_Score_{allele}"] = features_df["clean_peptide"].apply(
                lambda peptide: self._cal_PWM_score(peptide, allele)
            )
            na_count = features_df[f"PWM_Score_{allele}"].isna().sum()
            logger.info(
                f"Missing PWM scores for {na_count} peptides. Using median for imputation."
            )
            # Impute NA scores (peptides outside the supported mer range)
            # with the column median.
            features_df.fillna(
                {
                    f"PWM_Score_{allele}": features_df[
                        f"PWM_Score_{allele}"
                    ].median()
                },
                inplace=True,
            )

            if self.anchors != 0:
                logger.info(
                    f"Generating anchor scores for allele: {allele}, total peptides: {len(features_df)}"
                )
                # Anchor positions depend on peptide length, so compute the
                # most conserved positions once per supported mer length.
                anchor_dict = {}
                min_mer = min(self.pwms[allele].keys())
                max_mer = max(self.pwms[allele].keys())
                for mer_len in range(min_mer, max_mer + 1):
                    anchor_dict[mer_len] = self._most_conserved_postions(
                        self.pwms[allele][mer_len], self.anchors
                    )
                logger.info(
                    f"Most conserved positions for allele {allele}: {anchor_dict}"
                )
                features_df[f"Anchor_Score_{allele}"] = features_df[
                    "clean_peptide"
                ].apply(
                    lambda peptide: self._cal_anchor_score(
                        peptide, allele, anchor_dict
                    )
                )
                na_count = features_df[f"Anchor_Score_{allele}"].isna().sum()
                logger.info(
                    f"Missing anchor scores for {na_count} peptides. Using median for imputation."
                )
                features_df.fillna(
                    {
                        f"Anchor_Score_{allele}": features_df[
                            f"Anchor_Score_{allele}"
                        ].median()
                    },
                    inplace=True,
                )

        else:
            # Class II returns (core_score, n_flank_score, c_flank_score);
            # pd.Series expands the tuple into the three target columns.
            features_df[
                [
                    f"PWM_Score_{allele}",
                    f"N_Flank_PWM_Score_{allele}",
                    f"C_Flank_PWM_Score_{allele}",
                ]
            ] = features_df["clean_peptide"].apply(
                lambda pep: pd.Series(self._cal_PWM_score(pep, allele))
            )

            # Impute missing (NaN) with medians for each new column
            for col in [
                f"PWM_Score_{allele}",
                f"N_Flank_PWM_Score_{allele}",
                f"C_Flank_PWM_Score_{allele}",
            ]:
                na_count = features_df[col].isna().sum()
                logger.info(
                    f"Missing {col} for {na_count} peptides. Using median for imputation."
                )
                median_val = features_df[col].median()
                features_df.loc[:, col] = features_df[col].fillna(median_val)
                # Normalize dtype after fillna (avoids object-dtype columns).
                features_df[col] = features_df[col].infer_objects(copy=False)

    # Drop the working column; only 'Peptide' plus features are returned.
    features_df.drop(columns=["clean_peptide"], inplace=True)

    return features_df
-
-
@property
def feature_columns(self) -> List[str]:
    """Return the names of the feature columns this generator produces."""
    if self.mhc_class == "I":
        columns = [f"PWM_Score_{allele}" for allele in self.alleles]
        if self.anchors != 0:
            columns.extend(f"Anchor_Score_{allele}" for allele in self.alleles)
        return columns
    # MHC class II: core score plus N-/C-flank scores per allele.
    columns = []
    for allele in self.alleles:
        columns.extend(
            [
                f"PWM_Score_{allele}",
                f"N_Flank_PWM_Score_{allele}",
                f"C_Flank_PWM_Score_{allele}",
            ]
        )
    return columns
-[docs]
class BaseFeatureGenerator(ABC):
    """
    Abstract base class for all feature generators in the rescoring pipeline.

    Concrete subclasses must expose the feature columns they produce and the
    identifier column(s) used to merge those features back onto PSMs.
    """

    @property
    @abstractmethod
    def feature_columns(self) -> List[str]:
        """List of feature names produced by this generator."""
        ...

    @property
    @abstractmethod
    def id_column(self) -> List[str]:
        """Column(s) used as the key(s) to merge features with PSMs."""
        ...
-
-
-[docs]
class BasicFeatureGenerator(BaseFeatureGenerator):
    """
    Feature generator that derives simple sequence statistics from peptides.

    Computes peptide length relative to the dataset average, unique
    amino-acid counts/proportions, and the Shannon entropy of the
    amino-acid distribution.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences to generate features for.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the amino acids adjacent to the peptide.
        If True, removes them. Default is True.
    remove_modification : bool, optional
        Whether to remove modifications in the peptide sequences.
        If True, removes them. Default is True.

    Notes
    -----
    The generated features include:
    - length_diff_from_avg: Difference between peptide length and average length
    - abs_length_diff_from_avg: Absolute difference between peptide length and average length
    - unique_aa_count: Number of unique amino acids in the peptide
    - unique_aa_proportion: Proportion of unique amino acids in the peptide
    - shannon_entropy: Shannon entropy of amino acid distribution
    """

    def __init__(
        self,
        peptides: List[str],
        remove_pre_nxt_aa: bool = True,
        remove_modification: bool = True,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.peptides = peptides
        self.remove_pre_nxt_aa = remove_pre_nxt_aa
        self.remove_modification = remove_modification
        # Mean clean-peptide length; populated by generate_features().
        self.avg_length = None
        logger.info(f"Initialized BasicFeatureGenerator with {len(peptides)} peptides.")

    @property
    def feature_columns(self) -> List[str]:
        """Return the list of generated feature column names."""
        return [
            "length_diff_from_avg",
            "abs_length_diff_from_avg",
            "unique_aa_count",
            "unique_aa_proportion",
            "shannon_entropy",
        ]

    @property
    def id_column(self) -> List[str]:
        """
        Return the list of input columns required for feature generation.

        Returns
        -------
        List[str]
            List of input column names required for feature generation.
            Currently only requires 'Peptide' column.
        """
        return ["Peptide"]

    def _shannon_entropy(self, sequence: str) -> float:
        """
        Compute the Shannon entropy (base 2) of a peptide sequence.

        Parameters
        ----------
        sequence : str
            Peptide sequence.

        Returns
        -------
        float
            Shannon entropy value; 0.0 for an empty sequence.
        """
        if not sequence:
            return 0.0
        total = len(sequence)
        # Frequency of each distinct amino acid in the sequence.
        frequencies = [sequence.count(aa) / total for aa in set(sequence)]
        return entropy(frequencies, base=2)

    def generate_features(self) -> pd.DataFrame:
        """
        Generate basic features for the provided peptides.

        Returns
        -------
        pd.DataFrame
            DataFrame with 'Peptide' plus the columns listed in
            ``feature_columns``, all cast to float.

        Raises
        ------
        ValueError
            If NaN values are found in the generated features.

        Notes
        -----
        The average clean-peptide length over all inputs is used as the
        reference for the length-based features.
        """
        logger.info("Generating basic features.")
        working = pd.DataFrame(self.peptides, columns=["Peptide"])
        working["clean_peptide"] = working["Peptide"].apply(self._preprocess_peptide)
        working["peptide_length"] = working["clean_peptide"].apply(len)
        self.avg_length = working["peptide_length"].mean()
        working["length_diff_from_avg"] = working["peptide_length"] - self.avg_length
        working["abs_length_diff_from_avg"] = working["length_diff_from_avg"].abs()
        working["unique_aa_count"] = working["clean_peptide"].apply(
            lambda seq: len(set(seq))
        )
        working["unique_aa_proportion"] = (
            working["unique_aa_count"] / working["peptide_length"]
        )
        working["shannon_entropy"] = working["clean_peptide"].apply(
            self._shannon_entropy
        )
        # Explicit copy avoids pandas' SettingWithCopyWarning on the casts below.
        features_df = working[["Peptide"] + self.feature_columns].copy()
        for col in self.feature_columns:
            features_df[col] = features_df[col].astype(float)
        if features_df.isna().sum().sum() > 0:
            logger.error("NaN values found in the generated features.")
            raise ValueError("NaN values found in the generated features.")

        logger.info(f"Generated basic features for {len(features_df)} peptides.")
        return features_df
-[docs]
def get_best_allele(self) -> pd.DataFrame:
    """
    Get the best allele for each peptide.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the best alleles for the peptides:
        - Peptide: Original peptide sequence
        - mhcflurry_best_allele: Best binding allele

    Notes
    -----
    The best allele is determined by the lowest presentation percentile rank.
    """
    # Copy before renaming: calling rename(inplace=True) on a column
    # selection is chained assignment — pandas emits SettingWithCopyWarning
    # and the rename may silently apply to a temporary copy.
    best_allele_df = self.predictions[["Peptide", "best_allele"]].copy()
    best_allele_df.rename(
        columns={"best_allele": "mhcflurry_best_allele"}, inplace=True
    )

    logger.info(
        f"Generated best allele information for {len(best_allele_df)} peptides."
    )

    return best_allele_df
-
-
-
-[docs]
def predictions_to_dataframe(self) -> pd.DataFrame:
    """
    Return the cached prediction results as a DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the predictions.

    Raises
    ------
    ValueError
        If no predictions are available.
    """
    predictions = self.predictions
    if predictions is None:
        raise ValueError(
            "No predictions available. Please run 'generate_features' first."
        )
    return predictions
Source code for optimhc.feature_generator.netMHCIIpan
-# feature_generator/netMHCIIpan.py
-
-# TODO: set 'BA' and 'EL' as optional parameters for the user to choose the prediction method.
-
-fromoptimhc.feature_generator.base_feature_generatorimportBaseFeatureGenerator
-importpandasaspd
-frommhctoolsimportNetMHCIIpan43_BA,NetMHCIIpan43_EL
-fromtypingimportList,Dict,Optional
-fromoptimhcimportutils
-importlogging
-frommultiprocessingimportPool,cpu_count
-fromfunctoolsimportpartial
-fromtqdmimporttqdm
-
-logger=logging.getLogger(__name__)
-
-
-
-[docs]
def _predict_peptide_chunk_class2(
    peptides_chunk: List[str], alleles: List[str]
) -> pd.DataFrame:
    """
    Predict one batch of peptides with NetMHCIIpan43_BA (MHC Class II).

    Parameters
    ----------
    peptides_chunk : List[str]
        Batch of peptide sequences to predict.
    alleles : List[str]
        MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].

    Returns
    -------
    pd.DataFrame
        Prediction results as a DataFrame.
    """
    # A fresh predictor per chunk keeps this function picklable for
    # multiprocessing workers.
    chunk_predictor = NetMHCIIpan43_BA(alleles=alleles)
    return chunk_predictor.predict_peptides(peptides_chunk).to_dataframe()
-
-
-
-
-[docs]
class NetMHCIIpanFeatureGenerator(BaseFeatureGenerator):
    """
    Generate NetMHCIIpan features for given peptides based on specified MHC Class II alleles.

    This feature generator uses the NetMHCIIpan43_BA interface to predict MHC Class II
    binding for each peptide and returns scores and features based on the specified
    parameters.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].
    mode : str, optional
        Feature generation mode. Options:
        - 'best': Return only the best result for each peptide across all alleles.
        - 'all': Return prediction results for each peptide across all alleles
          (with allele-specific column suffixes).
        Default is 'best'.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the amino acids flanking the peptide
        (e.g., removing X-AA/AA-X forms). Default is True.
    remove_modification : bool, optional
        Whether to remove modification information from peptides, e.g., (Phospho).
        Default is True.
    n_processes : int, optional
        Number of processes to use. Default is 1 (no multiprocessing).
    show_progress : bool, optional
        Whether to display a progress bar. Default is False.

    Notes
    -----
    The generated features include:
    - netmhciipan_score: Raw binding score
    - netmhciipan_affinity: Binding affinity in nM
    - netmhciipan_percentile_rank: Percentile rank of the binding score
    """

    MIN_PEPTIDE_LENGTH = 9  # Minimum peptide length for MHC Class II is usually 9
    MAX_PEPTIDE_LENGTH = 50  # Can be adjusted based on use case
    CHUNKSIZE = 500

    def __init__(
        self,
        peptides: List[str],
        alleles: List[str],
        mode: str = "best",
        remove_pre_nxt_aa: bool = True,
        remove_modification: bool = True,
        n_processes: int = 1,
        show_progress: bool = False,
        *args,
        **kwargs,
    ):
        if mode not in ["best", "all"]:
            raise ValueError("Mode must be one of 'best' or 'all'.")

        self.peptides = peptides
        self.alleles = alleles
        self.mode = mode
        if len(alleles) == 1:
            # With a single allele, 'all' mode would only duplicate the
            # best-allele columns.
            self.mode = "best"
            logger.info("Only one allele provided. Switching to 'best' mode.")

        self.remove_pre_nxt_aa = remove_pre_nxt_aa
        self.remove_modification = remove_modification
        self.n_processes = min(n_processes, cpu_count())
        self.show_progress = show_progress
        self.predictor = NetMHCIIpan43_BA(alleles=self.alleles)
        # Merged (per input peptide) predictions; populated by _predict().
        self.predictions = None
        # Unmerged predictor output, one row per (peptide, allele).
        self._raw_predictions = None

        logger.info(
            f"Initialized NetMHCIIpanFeatureGenerator with {len(peptides)} peptides, "
            f"alleles={alleles}, mode={self.mode}, "
            f"n_processes={self.n_processes}, show_progress={self.show_progress}"
        )

    @property
    def raw_predictions(self) -> Optional[pd.DataFrame]:
        """Raw NetMHCIIpan prediction results, or None before prediction."""
        return self._raw_predictions

    @property
    def feature_columns(self) -> List[str]:
        """
        Return the list of generated feature column names, determined by the mode.
        Only includes numerical features, excluding any string features like allele names.

        Returns
        -------
        List[str]
            List of feature column names:
            - For 'all' mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele},
              netmhciipan_percentile_rank_{allele} for each allele
            - For both modes: netmhciipan_best_score, netmhciipan_best_affinity,
              netmhciipan_best_percentile_rank
        """
        columns = []
        if self.mode == "all":
            for allele in self.alleles:
                columns.extend(
                    [
                        f"netmhciipan_score_{allele}",
                        f"netmhciipan_affinity_{allele}",
                        f"netmhciipan_percentile_rank_{allele}",
                    ]
                )

        # Both 'best' and 'all' modes include best allele numerical information
        columns.extend(
            [
                "netmhciipan_best_score",
                "netmhciipan_best_affinity",
                "netmhciipan_best_percentile_rank",
            ]
        )
        return columns

    @property
    def id_column(self) -> List[str]:
        """
        Return the list of input columns required for the feature generator.

        Returns
        -------
        List[str]
            List of input column names.
        """
        return ["Peptide"]

    def _preprocess_peptides(self, peptide: str) -> str:
        """
        Preprocess the input peptide by removing flanking amino acids,
        modifications, and replacing non-standard amino acids.

        Parameters
        ----------
        peptide : str
            The original peptide sequence.

        Returns
        -------
        str
            The preprocessed peptide sequence.
        """
        if self.remove_pre_nxt_aa:
            peptide = utils.remove_pre_and_nxt_aa(peptide)
        if self.remove_modification:
            peptide = utils.remove_modifications(peptide)
        # Selenocysteine (U) is not supported by the predictor; map it to C.
        peptide = peptide.replace("U", "C")
        return peptide

    def _predict_multiprocessing(self, peptides_to_predict: List[str]) -> pd.DataFrame:
        """
        Run NetMHCIIpan predictions using multiprocessing.

        Parameters
        ----------
        peptides_to_predict : List[str]
            List of peptides to predict.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the prediction results:
            - peptide: Peptide sequence
            - allele: MHC allele
            - score: Raw binding score
            - affinity: Binding affinity in nM
            - percentile_rank: Percentile rank

        Notes
        -----
        This method:
        1. Splits peptides into chunks
        2. Processes chunks in parallel
        3. Combines results into a single DataFrame
        """
        logger.info("Running NetMHCIIpan predictions with multiprocessing.")
        # Cap chunk size so every worker gets at least one chunk.
        chunksize = min(
            NetMHCIIpanFeatureGenerator.CHUNKSIZE,
            max(1, len(peptides_to_predict) // self.n_processes),
        )
        func = partial(_predict_peptide_chunk_class2, alleles=self.alleles)
        chunks = [
            peptides_to_predict[i : i + chunksize]
            for i in range(0, len(peptides_to_predict), chunksize)
        ]

        with Pool(processes=self.n_processes) as pool:
            if self.show_progress:
                results = list(
                    tqdm(
                        pool.imap(func, chunks),
                        total=len(chunks),
                        desc="Predicting NetMHCIIpan",
                    )
                )
            else:
                results = pool.map(func, chunks)

        netmhciipan_results = pd.concat(results, ignore_index=True)
        # Save the raw prediction results
        self._raw_predictions = netmhciipan_results.copy()
        logger.info(
            f"Completed multiprocessing predictions for {len(peptides_to_predict)} peptides."
        )
        return netmhciipan_results

    def _predict(self) -> pd.DataFrame:
        """
        Run NetMHCIIpan predictions and cache the result.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the prediction results:
            - Peptide: Original peptide sequence
            - peptide: Cleaned peptide sequence
            - allele: MHC allele
            - score: Raw binding score
            - affinity: Binding affinity in nM
            - percentile_rank: Percentile rank

        Notes
        -----
        This method:
        1. Preprocesses peptides
        2. Filters peptides by length (MIN_PEPTIDE_LENGTH to MAX_PEPTIDE_LENGTH,
           i.e. 9-50 amino acids)
        3. Runs predictions (with or without multiprocessing)
        4. Merges results with original peptides
        """
        if self.predictions is not None:
            logger.info("NetMHCIIpan predictions already exist. Skipping prediction.")
            return self.predictions

        logger.info("Starting NetMHCIIpan predictions.")
        self.predictions = pd.DataFrame(self.peptides, columns=["Peptide"])
        self.predictions["clean_peptide"] = self.predictions["Peptide"].apply(
            self._preprocess_peptides
        )

        # Filter peptides that meet the length requirements
        peptides_to_predict = (
            self.predictions[
                self.predictions["clean_peptide"].apply(
                    lambda x: NetMHCIIpanFeatureGenerator.MIN_PEPTIDE_LENGTH
                    <= len(x)
                    <= NetMHCIIpanFeatureGenerator.MAX_PEPTIDE_LENGTH
                )
            ]["clean_peptide"]
            .unique()
            .tolist()
        )

        logger.info(
            f"Found {len(peptides_to_predict)} unique peptides meeting the length requirements."
        )

        if self.n_processes > 1:
            netmhciipan_results = self._predict_multiprocessing(peptides_to_predict)
        else:
            netmhciipan_results = self.predictor.predict_peptides(
                peptides_to_predict
            ).to_dataframe()
            # If not using multiprocessing, save raw prediction results here
            self._raw_predictions = netmhciipan_results.copy()

        logger.info(
            f"Predicted NetMHCIIpan results for {len(netmhciipan_results)} peptides."
        )

        self.predictions = self.predictions.merge(
            netmhciipan_results, left_on="clean_peptide", right_on="peptide", how="left"
        )
        self.predictions.drop(columns=["clean_peptide"], inplace=True)

        logger.info(
            f"Completed NetMHCIIpan predictions for {len(peptides_to_predict)} peptides."
        )
        return self.predictions

    def generate_features(self) -> pd.DataFrame:
        """
        Generate the final feature table with NetMHCIIpan features for each peptide.

        Returns
        -------
        pd.DataFrame
            DataFrame containing peptides and their predicted features:
            - Peptide: Original peptide sequence
            - For 'all' mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele},
              netmhciipan_percentile_rank_{allele} for each allele
            - For both modes: netmhciipan_best_score, netmhciipan_best_affinity,
              netmhciipan_best_percentile_rank

        Notes
        -----
        The features generated depend on the mode:
        - 'best': Only the best allele information for each peptide
        - 'all': All allele predictions plus best allele information

        Missing values are handled consistently by filling with median values
        for numeric columns.
        """
        predictions_df = self._predict()

        features_df = pd.DataFrame({"Peptide": self.peptides})

        # Generate allele-specific features if mode is 'all', otherwise generate best allele features
        if self.mode == "all":
            features_df = self._generate_all_allele_features(
                predictions_df, features_df
            )
        features_df = self._generate_best_allele_features(predictions_df, features_df)

        features_df = self._fill_missing_values(features_df)

        selected_columns = ["Peptide"] + self.feature_columns
        logger.info(f"Final selected feature columns: {selected_columns}")
        features_df = features_df[selected_columns]

        if features_df.isna().sum().sum() > 0:
            logger.warning(
                "NaN values still exist in the generated features after filling with median/mode values."
            )

        return features_df

    def _generate_all_allele_features(
        self, predictions_df: pd.DataFrame, features_df: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Generate features for all alleles.

        Parameters
        ----------
        predictions_df : pd.DataFrame
            The predictions DataFrame.
        features_df : pd.DataFrame
            The features DataFrame to update.

        Returns
        -------
        pd.DataFrame
            Updated features DataFrame with all allele features:
            - Peptide: Original peptide sequence
            - netmhciipan_score_{allele}: Raw binding score for each allele
            - netmhciipan_affinity_{allele}: Binding affinity for each allele
            - netmhciipan_percentile_rank_{allele}: Percentile rank for each allele
        """
        logger.info("Generating features for all alleles.")

        for allele in self.alleles:
            logger.info(f"Adding scores for allele {allele}.")
            allele_df = predictions_df[predictions_df["allele"] == allele].copy()

            if allele_df.empty:
                logger.warning(
                    f"No prediction results found for allele {allele}. Filling with NaN."
                )
                allele_features = pd.DataFrame(
                    {
                        "Peptide": self.peptides,
                        f"netmhciipan_score_{allele}": [pd.NA] * len(self.peptides),
                        f"netmhciipan_affinity_{allele}": [pd.NA] * len(self.peptides),
                        f"netmhciipan_percentile_rank_{allele}": [pd.NA]
                        * len(self.peptides),
                    }
                )
            else:
                allele_df = allele_df.rename(
                    columns={
                        "score": f"netmhciipan_score_{allele}",
                        "affinity": f"netmhciipan_affinity_{allele}",
                        "percentile_rank": f"netmhciipan_percentile_rank_{allele}",
                    }
                )

                allele_features = allele_df[
                    [
                        "Peptide",
                        f"netmhciipan_score_{allele}",
                        f"netmhciipan_affinity_{allele}",
                        f"netmhciipan_percentile_rank_{allele}",
                    ]
                ]

            features_df = features_df.merge(allele_features, on="Peptide", how="left")

        logger.info("Added scores for all alleles.")
        return features_df

    def _generate_best_allele_features(
        self, predictions_df: pd.DataFrame, features_df: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Generate features for the best allele.

        Parameters
        ----------
        predictions_df : pd.DataFrame
            The predictions DataFrame.
        features_df : pd.DataFrame
            The features DataFrame to update.

        Returns
        -------
        pd.DataFrame
            Updated features DataFrame with best allele features:
            - Peptide: Original peptide sequence
            - netmhciipan_best_allele: Best binding allele
            - netmhciipan_best_score: Best binding score
            - netmhciipan_best_affinity: Best binding affinity
            - netmhciipan_best_percentile_rank: Best percentile rank

        Notes
        -----
        The best allele is determined by the lowest percentile rank.
        """
        logger.info("Generating features for best allele.")

        valid_predictions = predictions_df.dropna(subset=["percentile_rank"])

        if valid_predictions.empty:
            logger.warning("No valid predictions available to select the best allele.")
            best_allele_features = pd.DataFrame(
                {
                    "Peptide": self.peptides,
                    "netmhciipan_best_allele": ["Unknown"] * len(self.peptides),
                    "netmhciipan_best_score": [pd.NA] * len(self.peptides),
                    "netmhciipan_best_affinity": [pd.NA] * len(self.peptides),
                    "netmhciipan_best_percentile_rank": [pd.NA] * len(self.peptides),
                }
            )
        else:
            # Find the index of the minimum percentile rank for each peptide
            idx = valid_predictions.groupby("Peptide")["percentile_rank"].idxmin()

            best_allele_features = valid_predictions.loc[idx].rename(
                columns={
                    "allele": "netmhciipan_best_allele",
                    "score": "netmhciipan_best_score",
                    "affinity": "netmhciipan_best_affinity",
                    "percentile_rank": "netmhciipan_best_percentile_rank",
                }
            )

            best_allele_features = best_allele_features[
                [
                    "Peptide",
                    "netmhciipan_best_allele",
                    "netmhciipan_best_score",
                    "netmhciipan_best_affinity",
                    "netmhciipan_best_percentile_rank",
                ]
            ]

            # Handle missing peptides
            missing_peptides = set(self.peptides) - set(best_allele_features["Peptide"])

            if missing_peptides:
                logger.warning(
                    f"Found {len(missing_peptides)} peptides with no best allele prediction."
                )
                missing_features = pd.DataFrame(
                    {
                        "Peptide": list(missing_peptides),
                        "netmhciipan_best_allele": ["Unknown"] * len(missing_peptides),
                        "netmhciipan_best_score": [pd.NA] * len(missing_peptides),
                        "netmhciipan_best_affinity": [pd.NA] * len(missing_peptides),
                        "netmhciipan_best_percentile_rank": [pd.NA]
                        * len(missing_peptides),
                    }
                )
                best_allele_features = pd.concat(
                    [best_allele_features, missing_features], ignore_index=True
                )

        features_df = features_df.merge(best_allele_features, on="Peptide", how="left")
        logger.info("Added best allele information.")

        return features_df

    def _fill_missing_values(self, features_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing values in the features DataFrame.

        Parameters
        ----------
        features_df : pd.DataFrame
            The features DataFrame to update.

        Returns
        -------
        pd.DataFrame
            Updated features DataFrame with filled missing values.

        Notes
        -----
        This method:
        1. Fills best allele string values with 'Unknown'
        2. Fills numeric values with median for all allele features
        3. Fills numeric values for best allele features with median

        Filled columns are assigned back explicitly instead of calling
        ``fillna(inplace=True)`` on a column selection, which is chained
        assignment and deprecated/ineffective in recent pandas versions.
        """
        logger.info("Filling missing values in the features DataFrame.")

        # Fill best allele string values
        if "netmhciipan_best_allele" in features_df.columns:
            features_df["netmhciipan_best_allele"] = features_df[
                "netmhciipan_best_allele"
            ].fillna("Unknown")

        # Fill numeric values with median for all allele features
        if self.mode == "all":
            for allele in self.alleles:
                for metric in ["score", "affinity", "percentile_rank"]:
                    col = f"netmhciipan_{metric}_{allele}"
                    if col in features_df.columns:
                        median_value = features_df[col].median()
                        features_df[col] = features_df[col].fillna(median_value)

        # Fill numeric values for best allele features
        for metric in ["best_score", "best_affinity", "best_percentile_rank"]:
            col = f"netmhciipan_{metric}"
            if col in features_df.columns and features_df[col].isna().any():
                median_value = features_df[col].median()
                # If all values are NA, median will be NA, so use 0 instead
                median_value = 0 if pd.isna(median_value) else median_value
                features_df[col] = features_df[col].fillna(median_value)

        logger.info("Filled missing values in the features DataFrame.")
        return features_df

    def predictions_to_dataframe(self) -> pd.DataFrame:
        """
        Convert the predictions to a DataFrame.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the predictions.

        Raises
        ------
        ValueError
            If no predictions are available.
        """
        if self.predictions is None:
            raise ValueError(
                "No predictions available. Please run 'generate_features' first."
            )
        return self.predictions

    def get_raw_predictions(self) -> pd.DataFrame:
        """
        Get the raw prediction results DataFrame from NetMHCIIpan.

        Returns
        -------
        pd.DataFrame or None
            Raw prediction results DataFrame containing:
            - peptide: Cleaned peptide sequence
            - allele: MHC allele
            - score: Raw binding score
            - affinity: Binding affinity in nM
            - percentile_rank: Percentile rank
            None if no predictions have been run yet.
        """
        # Bug fix: the attribute is ``_raw_predictions``; ``self.raw_predictions``
        # previously raised AttributeError (no such attribute existed).
        return self._raw_predictions

    def save_raw_predictions(self, file_path: str, **kwargs) -> None:
        """
        Save the raw prediction results to a file.

        Parameters
        ----------
        file_path : str
            Path to save the file.
        **kwargs : dict
            Additional parameters passed to pandas.DataFrame.to_csv.
            If 'index' is not specified, it defaults to False.

        Notes
        -----
        This method saves the raw predictions DataFrame to a CSV file.
        The DataFrame includes:
        - peptide: Cleaned peptide sequence
        - allele: MHC allele
        - score: Raw binding score
        - affinity: Binding affinity in nM
        - percentile_rank: Percentile rank
        """
        if "index" not in kwargs:
            kwargs["index"] = False
        # Bug fix: read the ``_raw_predictions`` attribute (``self.raw_predictions``
        # previously raised AttributeError).
        if self._raw_predictions is not None:
            self._raw_predictions.to_csv(file_path, **kwargs)
            logger.info(f"Raw prediction results saved to: {file_path}")
        else:
            logger.warning("No raw prediction results available to save.")
Source code for optimhc.feature_generator.netMHCpan
-# feature_generators/netmhcpan_feature_generator.py
-
-# TODO: Except 'best' mode, the other modes seems to be not working properly. We need to investigate this issue.
-
-from.base_feature_generatorimportBaseFeatureGenerator
-importpandasaspd
-frommhctoolsimportNetMHCpan41
-fromtypingimportList,Dict,Optional
-fromoptimhcimportutils
-importlogging
-frommultiprocessingimportPool,cpu_count
-fromfunctoolsimportpartial
-fromtqdmimporttqdm
-
-logger=logging.getLogger(__name__)
-
-# Helper function for multiprocessing
-
-[docs]
def _predict_peptide_chunk(
    peptides_chunk: List[str], alleles: List[str]
) -> pd.DataFrame:
    """
    Predict NetMHCpan scores for a chunk of peptides.

    Parameters
    ----------
    peptides_chunk : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC allele names.

    Returns
    -------
    pd.DataFrame
        DataFrame containing predictions:
        - peptide: Peptide sequence
        - allele: MHC allele
        - score: Raw binding score
        - affinity: Binding affinity in nM
        - percentile_rank: Percentile rank
    """
    # A fresh predictor per chunk keeps this function picklable for
    # multiprocessing workers.
    chunk_predictor = NetMHCpan41(alleles=alleles)
    return chunk_predictor.predict_peptides(peptides_chunk).to_dataframe()
-
-
-
-
-[docs]
class NetMHCpanFeatureGenerator(BaseFeatureGenerator):
    """
    Generate NetMHCpan features for peptides based on specified MHC class I alleles.

    This generator calculates NetMHCpan binding predictions for each peptide against
    the provided MHC class I alleles.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC allele names (e.g., ['HLA-A*02:01', 'HLA-B*07:02']).
    mode : str, optional
        Mode of feature generation. Options:
        - 'best': Return only the best allele information for each peptide.
        - 'all': Return predictions for all alleles with allele-specific suffixes plus best allele info.
        Default is 'best'.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the previous and next (flanking) amino acids from
        peptides before prediction. Default is False.
    remove_modification : bool, optional
        Whether to remove modifications from peptides before prediction.
        Default is True.
    n_processes : int, optional
        Number of processes to use for multiprocessing.
        Default is 1 (no multiprocessing).
    show_progress : bool, optional
        Whether to display a progress bar. Default is False.

    Notes
    -----
    The generated features include:
    - netmhcpan_score: Raw binding score
    - netmhcpan_affinity: Binding affinity in nM
    - netmhcpan_percentile_rank: Percentile rank of the binding score
    """

    # Peptides outside this length range are not sent to NetMHCpan.
    MIN_PEPTIDE_LENGTH = 8
    MAX_PEPTIDE_LENGTH = 30
    # Upper bound on peptides per multiprocessing work unit.
    CHUNKSIZE = 250

    def __init__(
        self,
        peptides: List[str],
        alleles: List[str],
        mode: str = "best",
        remove_pre_nxt_aa: bool = False,
        remove_modification: bool = True,
        n_processes: int = 1,
        show_progress: bool = False,
        *args,
        **kwargs,
    ):
        if mode not in ["best", "all"]:
            raise ValueError("Mode must be one of 'best' or 'all'.")

        self.peptides = peptides
        self.alleles = alleles
        self.mode = mode
        if len(alleles) == 1:
            # With a single allele, 'all' and 'best' produce the same info.
            self.mode = "best"
            logger.info("Only one allele provided. Switching to 'best' mode.")
        self.remove_pre_nxt_aa = remove_pre_nxt_aa
        self.remove_modification = remove_modification
        # Never request more workers than available CPU cores.
        self.n_processes = min(n_processes, cpu_count())
        self.show_progress = show_progress
        # Predictor used on the single-process path; workers build their own.
        self.predictor = NetMHCpan41(alleles=self.alleles)
        self.predictions = None
        self._raw_predictions = None
        logger.info(
            f"Initialized NetMHCpanFeatureGenerator with {len(peptides)} peptides, alleles: {alleles}, mode: {mode}, n_processes: {self.n_processes}, show_progress: {self.show_progress}"
        )
-
- @property
- deffeature_columns(self)->List[str]:
-"""
- Return the list of generated feature column names, determined by the mode.
- Only includes numerical features, excluding any string features like allele names.
-
- Returns
- -------
- List[str]
- List of feature column names:
- - For 'all' mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele},
- netmhcpan_percentile_rank_{allele} for each allele
- - For both modes: netmhcpan_best_score, netmhcpan_best_affinity,
- netmhcpan_best_percentile_rank
- """
- columns=[]
- ifself.mode=="all":
- allele_specific=[]
- foralleleinself.alleles:
- allele_specific.extend(
- [
- f"netmhcpan_score_{allele}",
- f"netmhcpan_affinity_{allele}",
- f"netmhcpan_percentile_rank_{allele}",
- ]
- )
- columns.extend(allele_specific)
-
- # Both 'best' and 'all' modes include best allele numerical information
- columns.extend(
- [
- "netmhcpan_best_score",
- "netmhcpan_best_affinity",
- "netmhcpan_best_percentile_rank",
- ]
- )
- returncolumns
-
- @property
- defid_column(self)->List[str]:
-"""
- Return the list of input columns required for the feature generator.
-
- Returns
- -------
- List[str]
- List of input column names.
- """
- return["Peptide"]
-
-
-[docs]
- def_preprocess_peptides(self,peptide:str)->str:
-"""
- Preprocess the input peptide by removing flanking amino acids,
- modifications, and replacing non-standard amino acids.
-
- Parameters
- ----------
- peptide : str
- The original peptide sequence.
-
- Returns
- -------
- str
- The preprocessed peptide sequence.
- """
- ifself.remove_pre_nxt_aa:
- peptide=utils.remove_pre_and_nxt_aa(peptide)
- ifself.remove_modification:
- peptide=utils.remove_modifications(peptide)
- # Replace any non-standard amino acids if necessary
- peptide=peptide.replace("U","C")
- returnpeptide
-
-
-
-[docs]
- def_predict_multiprocessing(self,peptides_to_predict:List[str])->pd.DataFrame:
-"""
- Run NetMHCpan predictions using multiprocessing.
-
- Parameters
- ----------
- peptides_to_predict : List[str]
- List of peptides to predict.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing the prediction results:
- - peptide: Peptide sequence
- - allele: MHC allele
- - score: Raw binding score
- - affinity: Binding affinity in nM
- - percentile_rank: Percentile rank
-
- Notes
- -----
- This method:
- 1. Splits peptides into chunks
- 2. Processes chunks in parallel
- 3. Combines results into a single DataFrame
- """
- logger.info("Running NetMHCpan predictions with multiprocessing.")
- chunksize=min(
- NetMHCpanFeatureGenerator.CHUNKSIZE,
- max(1,len(peptides_to_predict)//self.n_processes),
- )
- func=partial(_predict_peptide_chunk,alleles=self.alleles)
-
- withPool(processes=self.n_processes)aspool:
- ifself.show_progress:
- results=list(
- tqdm(
- pool.imap(
- func,
- [
- peptides_to_predict[i:i+chunksize]
- foriinrange(0,len(peptides_to_predict),chunksize)
- ],
- ),
- total=(len(peptides_to_predict)+chunksize-1)//chunksize,
- desc="Predicting NetMHCpan",
- )
- )
- else:
- results=pool.map(
- func,
- [
- peptides_to_predict[i:i+chunksize]
- foriinrange(0,len(peptides_to_predict),chunksize)
- ],
- )
-
- netmhcpan_results=pd.concat(results,ignore_index=True)
- # Save the raw prediction results
- self._raw_predictions=netmhcpan_results.copy()
- logger.info(
- f"Completed multiprocessing predictions for {len(peptides_to_predict)} peptides."
- )
- returnnetmhcpan_results
-
-
-
-[docs]
- def_predict(self)->pd.DataFrame:
-"""
- Run NetMHCpan predictions and cache the result.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing the prediction results:
- - Peptide: Original peptide sequence
- - peptide: Cleaned peptide sequence
- - allele: MHC allele
- - score: Raw binding score
- - affinity: Binding affinity in nM
- - percentile_rank: Percentile rank
-
- Notes
- -----
- This method:
- 1. Preprocesses peptides
- 2. Filters peptides by length (8-30 amino acids)
- 3. Runs predictions (with or without multiprocessing)
- 4. Merges results with original peptides
- """
- ifself.predictionsisnotNone:
- logger.info("NetMHCpan predictions already exist. Skipping prediction.")
- returnself.predictions
-
- logger.info("Starting NetMHCpan predictions.")
- self.predictions=pd.DataFrame(self.peptides,columns=["Peptide"])
- self.predictions["clean_peptide"]=self.predictions["Peptide"].apply(
- self._preprocess_peptides
- )
-
- # Filter peptides that meet the length requirements
- peptides_to_predict=(
- self.predictions[
- self.predictions["clean_peptide"].apply(
- lambdax:NetMHCpanFeatureGenerator.MIN_PEPTIDE_LENGTH
- <=len(x)
- <=NetMHCpanFeatureGenerator.MAX_PEPTIDE_LENGTH
- )
- ]["clean_peptide"]
- .unique()
- .tolist()
- )
-
- logger.info(
- f"Found {len(peptides_to_predict)} unique peptides meeting the length requirements."
- )
-
- ifself.n_processes>1:
- netmhcpan_results=self._predict_multiprocessing(peptides_to_predict)
- else:
- netmhcpan_results=self.predictor.predict_peptides(
- peptides_to_predict
- ).to_dataframe()
- # If not using multiprocessing, save raw prediction results here
- self._raw_predictions=netmhcpan_results.copy()
-
- logger.info(
- f"Predicted NetMHCpan results for {len(netmhcpan_results)} peptides."
- )
-
- self.predictions=self.predictions.merge(
- netmhcpan_results,left_on="clean_peptide",right_on="peptide",how="left"
- )
- self.predictions.drop(columns=["clean_peptide"],inplace=True)
-
- logger.info(
- f"Completed NetMHCpan predictions for {len(peptides_to_predict)} peptides."
- )
- returnself.predictions
-[docs]
- defget_raw_predictions(self)->pd.DataFrame:
-"""
- Get the raw prediction results DataFrame from NetMHCpan.
-
- Returns
- -------
- pd.DataFrame
- Raw prediction results DataFrame containing:
- - peptide: Cleaned peptide sequence
- - allele: MHC allele
- - score: Raw binding score
- - affinity: Binding affinity in nM
- - percentile_rank: Percentile rank
- """
- returnself.raw_predictions
-
-
-
-[docs]
- defsave_raw_predictions(self,file_path:str,**kwargs)->None:
-"""
- Save the raw prediction results to a file.
-
- Parameters
- ----------
- file_path : str
- Path to save the file.
- **kwargs : dict
- Additional parameters passed to pandas.DataFrame.to_csv.
- If 'index' is not specified, it defaults to False.
-
- Notes
- -----
- This method saves the raw predictions DataFrame to a CSV file.
- The DataFrame includes:
- - peptide: Cleaned peptide sequence
- - allele: MHC allele
- - score: Raw binding score
- - affinity: Binding affinity in nM
- - percentile_rank: Percentile rank
- """
- if"index"notinkwargs:
- kwargs["index"]=False
- ifself.raw_predictionsisnotNone:
- self.raw_predictions.to_csv(file_path,**kwargs)
- logger.info(f"Raw prediction results saved to: {file_path}")
- else:
- logger.warning("No raw prediction results available to save.")
-
-
-
-[docs]
- defgenerate_features(self)->pd.DataFrame:
-"""
- Generate the final feature table with NetMHCpan features for each peptide.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing peptides and their predicted features:
- - Peptide: Original peptide sequence
- - For 'all' mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele},
- netmhcpan_percentile_rank_{allele} for each allele
- - For both modes: netmhcpan_best_score, netmhcpan_best_affinity,
- netmhcpan_best_percentile_rank
-
- Notes
- -----
- The features generated depend on the mode:
- - 'best': Only the best allele information for each peptide
- - 'all': All allele predictions plus best allele information
-
- Missing values are handled consistently by filling with median values
- for numeric columns.
- """
- predictions_df=self._predict()
-
- features_df=pd.DataFrame({"Peptide":self.peptides})
-
- # Generate allele-specific features if mode is 'all', otherwise generate best allele features
- ifself.mode=="all":
- features_df=self._generate_all_allele_features(
- predictions_df,features_df
- )
- features_df=self._generate_best_allele_features(predictions_df,features_df)
-
- features_df=self._fill_missing_values(features_df)
-
- selected_columns=["Peptide"]+self.feature_columns
- logger.info(f"Final selected feature columns: {selected_columns}")
- features_df=features_df[selected_columns]
-
- iffeatures_df.isna().sum().sum()>0:
- logger.warning(
- "NaN values still exist in the generated features after filling with median/mode values."
- )
-
- returnfeatures_df
-
-
-
-[docs]
- def_generate_all_allele_features(
- self,predictions_df:pd.DataFrame,features_df:pd.DataFrame
- )->pd.DataFrame:
-"""
- Generate features for all alleles.
-
- Parameters
- ----------
- predictions_df : pd.DataFrame
- The predictions DataFrame.
- features_df : pd.DataFrame
- The features DataFrame to update.
-
- Returns
- -------
- pd.DataFrame
- Updated features DataFrame with all allele features:
- - Peptide: Original peptide sequence
- - netmhcpan_score_{allele}: Raw binding score for each allele
- - netmhcpan_affinity_{allele}: Binding affinity for each allele
- - netmhcpan_percentile_rank_{allele}: Percentile rank for each allele
- """
- logger.info("Generating features for all alleles.")
-
- foralleleinself.alleles:
- logger.info(f"Adding scores for allele {allele}.")
- allele_df=predictions_df[predictions_df["allele"]==allele].copy()
-
- ifallele_df.empty:
- logger.warning(
- f"No prediction results found for allele {allele}. Filling with NaN."
- )
- allele_features=pd.DataFrame(
- {
- "Peptide":self.peptides,
- f"netmhcpan_score_{allele}":[pd.NA]*len(self.peptides),
- f"netmhcpan_affinity_{allele}":[pd.NA]*len(self.peptides),
- f"netmhcpan_percentile_rank_{allele}":[pd.NA]
- *len(self.peptides),
- }
- )
- else:
- allele_df=allele_df.rename(
- columns={
- "score":f"netmhcpan_score_{allele}",
- "affinity":f"netmhcpan_affinity_{allele}",
- "percentile_rank":f"netmhcpan_percentile_rank_{allele}",
- }
- )
-
- allele_features=allele_df[
- [
- "Peptide",
- f"netmhcpan_score_{allele}",
- f"netmhcpan_affinity_{allele}",
- f"netmhcpan_percentile_rank_{allele}",
- ]
- ]
-
- features_df=features_df.merge(allele_features,on="Peptide",how="left")
-
- logger.info("Added scores for all alleles.")
- returnfeatures_df
-
-
-
-[docs]
- def_generate_best_allele_features(
- self,predictions_df:pd.DataFrame,features_df:pd.DataFrame
- )->pd.DataFrame:
-"""
- Generate features for the best allele.
-
- Parameters
- ----------
- predictions_df : pd.DataFrame
- The predictions DataFrame.
- features_df : pd.DataFrame
- The features DataFrame to update.
-
- Returns
- -------
- pd.DataFrame
- Updated features DataFrame with best allele features:
- - Peptide: Original peptide sequence
- - netmhcpan_best_allele: Best binding allele
- - netmhcpan_best_score: Best binding score
- - netmhcpan_best_affinity: Best binding affinity
- - netmhcpan_best_percentile_rank: Best percentile rank
-
- Notes
- -----
- The best allele is determined by the lowest percentile rank.
- """
- logger.info("Generating features for best allele.")
-
- valid_predictions=predictions_df.dropna(subset=["percentile_rank"])
-
- ifvalid_predictions.empty:
- logger.warning("No valid predictions available to select the best allele.")
- best_allele_features=pd.DataFrame(
- {
- "Peptide":self.peptides,
- "netmhcpan_best_allele":["Unknown"]*len(self.peptides),
- "netmhcpan_best_score":[pd.NA]*len(self.peptides),
- "netmhcpan_best_affinity":[pd.NA]*len(self.peptides),
- "netmhcpan_best_percentile_rank":[pd.NA]*len(self.peptides),
- }
- )
- else:
- # Find the index of the minimum percentile rank for each peptide
- idx=valid_predictions.groupby("Peptide")["percentile_rank"].idxmin()
-
- best_allele_features=valid_predictions.loc[idx].rename(
- columns={
- "allele":"netmhcpan_best_allele",
- "score":"netmhcpan_best_score",
- "affinity":"netmhcpan_best_affinity",
- "percentile_rank":"netmhcpan_best_percentile_rank",
- }
- )
-
- best_allele_features=best_allele_features[
- [
- "Peptide",
- "netmhcpan_best_allele",
- "netmhcpan_best_score",
- "netmhcpan_best_affinity",
- "netmhcpan_best_percentile_rank",
- ]
- ]
-
- # Handle missing peptides
- missing_peptides=set(self.peptides)-set(best_allele_features["Peptide"])
-
- ifmissing_peptides:
- logger.warning(
- f"Found {len(missing_peptides)} peptides with no best allele prediction."
- )
- missing_features=pd.DataFrame(
- {
- "Peptide":list(missing_peptides),
- "netmhcpan_best_allele":["Unknown"]*len(missing_peptides),
- "netmhcpan_best_score":[pd.NA]*len(missing_peptides),
- "netmhcpan_best_affinity":[pd.NA]*len(missing_peptides),
- "netmhcpan_best_percentile_rank":[pd.NA]
- *len(missing_peptides),
- }
- )
- best_allele_features=pd.concat(
- [best_allele_features,missing_features],ignore_index=True
- )
-
- features_df=features_df.merge(best_allele_features,on="Peptide",how="left")
- logger.info("Added best allele information.")
-
- returnfeatures_df
-
-
-
-[docs]
- def_fill_missing_values(self,features_df:pd.DataFrame)->pd.DataFrame:
-"""
- Fill missing values in the features DataFrame.
-
- Parameters
- ----------
- features_df : pd.DataFrame
- The features DataFrame to update.
-
- Returns
- -------
- pd.DataFrame
- Updated features DataFrame with filled missing values.
-
- Notes
- -----
- This method:
- 1. Fills best allele string values with 'Unknown'
- 2. Fills numeric values with median for all allele features
- 3. Fills numeric values for best allele features with median
- """
- logger.info("Filling missing values in the features DataFrame.")
-
- # Fill best allele string values
- if"netmhcpan_best_allele"infeatures_df.columns:
- features_df["netmhcpan_best_allele"].fillna("Unknown",inplace=True)
-
- # Fill numeric values with median for all allele features
- ifself.mode=="all":
- foralleleinself.alleles:
- formetricin["score","affinity","percentile_rank"]:
- col=f"netmhcpan_{metric}_{allele}"
- ifcolinfeatures_df.columns:
- median_value=features_df[col].median()
- features_df[col].fillna(median_value,inplace=True)
-
- # Fill numeric values for best allele features
- formetricin["best_score","best_affinity","best_percentile_rank"]:
- col=f"netmhcpan_{metric}"
- ifcolinfeatures_df.columnsandfeatures_df[col].isna().any():
- median_value=features_df[col].median()
- # If all values are NA, median will be NA, so use 0 instead
- median_value=0ifpd.isna(median_value)elsemedian_value
- features_df[col].fillna(median_value,inplace=True)
-
- logger.info("Filled missing values in the features DataFrame.")
- returnfeatures_df
-
-
-
-[docs]
- defpredictions_to_dataframe(self)->pd.DataFrame:
-"""
- Convert the predictions to a DataFrame.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing the predictions.
-
- Raises
- ------
- ValueError
- If no predictions are available.
- """
- ifself.predictionsisNone:
- raiseValueError(
- "No predictions available. Please run 'generate_features' first."
- )
- returnself.predictions
-
-
-
- # def get_best_allele(self) -> pd.DataFrame:
- # """
- # Return the best allele (with the lowest percentile rank) for each peptide across all alleles.
-
- # Returns:
- # pd.DataFrame: DataFrame containing the best allele information.
- # """
- # logger.info("Getting best allele information.")
- # predictions_df = self._predict()
-
- # features_df = pd.DataFrame({'Peptide': self.peptides})
- # best_features_df = self._generate_best_allele_features(predictions_df, features_df)
- # best_columns = ['Peptide', 'netmhcpan_best_allele', 'netmhcpan_best_score',
- # 'netmhcpan_best_affinity', 'netmhcpan_best_percentile_rank']
- # best_allele_df = best_features_df[best_columns]
- # best_allele_df = self._fill_missing_values(best_allele_df)
-
- # logger.info(f"Generated best allele information for {len(best_allele_df)} peptides.")
- # return best_allele_df
-
-[docs]
class OverlappingPeptideFeatureGenerator(BaseFeatureGenerator):
    """
    Generate features from peptide sequence overlaps using an
    Overlap-Layout-Consensus (OLC) style assembly.

    The generator builds a directed overlap graph of the (filtered)
    peptides, removes transitive edges, collapses non-branching paths into
    contigs, and derives per-peptide features such as the number of
    peptides sharing a contig, the peptide's rank within its contig, and
    the contig length. Peptides with outlier lengths or low Shannon
    entropy are filtered out before graph construction. Detailed
    brother-peptide and contig information is available via get_all_data.

    Parameters
    ----------
    peptides : list of str
        List of peptide sequences.
    min_overlap_length : int, optional
        Minimum required overlap length for peptides to be considered
        overlapping. Default is 6.
    min_length : int, optional
        Minimum peptide length to include in processing. Default is 7.
    max_length : int, optional
        Maximum peptide length to include in processing. Default is 60.
    min_entropy : float, optional
        Minimum Shannon entropy for peptides to include in processing.
        Default is 0.
    fill_missing : str, optional
        Method to fill missing values for filtered peptides: 'median' or
        'zero'. Default is 'median'.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the preceding and following amino acids from
        peptides. Default is False.
    remove_modification : bool, optional
        Whether to remove modifications from peptides. Default is True.

    Notes
    -----
    Key data structures:

    - contigs (List[List[str]]): non-branching paths of the overlap graph;
      each inner list is a chain of peptide sequences, e.g.
      [['PEPTIDE1', 'PEPTIDE2'], ['PEPTIDE3']].
    - assembled_contigs (List[Dict]): one dict per contig with keys
      'sequence' (the merged sequence) and 'peptides' (the member
      peptides used to build it).
    - peptide_to_contig (Dict[str, int]): maps each peptide to the index
      of its contig in assembled_contigs.
    - _overlap_graph (nx.DiGraph): all pairwise overlaps; nodes are
      peptides, edge weights are overlap lengths.
    - _simplified_graph (nx.DiGraph): overlap graph with transitive edges
      removed; used for final contig assembly.
    """

    def __init__(
        self,
        peptides: List[str],
        min_overlap_length: int = 6,
        min_length: int = 7,
        max_length: int = 60,
        min_entropy: float = 0,
        fill_missing: str = "median",  # 'median' or 'zero'
        remove_pre_nxt_aa: bool = False,
        remove_modification: bool = True,
        *args,
        **kwargs,
    ):
        self.original_peptides = peptides
        self.min_overlap_length = min_overlap_length
        self.min_length = min_length
        self.max_length = max_length
        self.min_entropy = min_entropy
        self.fill_missing = fill_missing.lower()
        self.remove_pre_nxt_aa = remove_pre_nxt_aa
        self.remove_modification = remove_modification
        # Working state, populated during feature calculation.
        self.filtered_peptides = []
        self.filtered_indices = []
        self.peptide_to_index = {}
        self.overlap_data = None
        self.peptide_to_contig = {}
        self.assembled_contigs = []
        self.full_data = None
        self._overlap_graph = None
        self._simplified_graph = None
        logger.info(
            f"Initialized OverlappingPeptideFeatureGenerator with {len(peptides)} peptides and minimum overlap length: {min_overlap_length}"
        )
        logger.info(
            f"remove_pre_nxt_aa: {remove_pre_nxt_aa}, remove_modification: {remove_modification}"
        )
        logger.info(
            f"Peptide filtering parameters - min_length: {min_length}, max_length: {max_length}, min_entropy: {min_entropy}"
        )
-
- @property
- defid_column(self)->List[str]:
-"""
- Returns a list of input columns required for the feature generator.
-
- Returns:
- List[str]: List of input columns.
- """
- return["Peptide"]
-
- @property
- deffeature_columns(self)->List[str]:
-"""Returns the feature column names."""
-"""
- return ['contig_member_count', 'log_contig_member_count', 'contig_member_rank', 'log_contig_member_rank',
- 'contig_seq_length_diff', 'contig_length', 'contig_extension_ratio']
- """
- return[
- "contig_member_count",
- "contig_extension_ratio",
- "contig_member_rank",
- "contig_length",
- ]
-
- @property
- defoverlap_graph(self)->nx.DiGraph:
-"""Returns the overlap graph."""
- returnself._overlap_graph
-
- @property
- defsimplified_graph(self)->nx.DiGraph:
-"""Returns the layout graph."""
- returnself._simplified_graph
-
- @property
- defcontigs(self)->List[Dict]:
-"""Returns the assembled contigs."""
- returnself.assembled_contigs
-
-
-[docs]
- def_filter_peptides(self,peptides:List[str])->List[str]:
-"""
- Filter out peptides based on length and entropy.
-
- Parameters:
- peptides (List[str]): List of peptide sequences.
-
- Returns:
- List[str]: Filtered list of peptide sequences.
- """
- filtered_peptides=[]
- forpeptideinpeptides:
- iflen(peptide)<self.min_lengthorlen(peptide)>self.max_length:
- continue
- entropy_val=self._shannon_entropy(peptide)
- ifentropy_val<self.min_entropy:
- continue
- filtered_peptides.append(peptide)
- logger.info(
- f"Filtered out {len(peptides)-len(filtered_peptides)} peptides based on length and entropy."
- )
- logger.info(f"Remaining peptides: {len(filtered_peptides)}")
- returnfiltered_peptides
-
-
-
-[docs]
- def_construct_prefix_index(
- self,peptides:List[str],min_overlap_length:int
- )->Dict[str,List[int]]:
-"""
- Construct an index of prefixes for all peptides.
-
- Parameters:
- peptides (List[str]): List of peptide sequences.
-
- Returns:
- Dict[str, List[int]]: Dictionary mapping prefixes to list of peptide indices.
- """
- prefix_index=defaultdict(list)
- foridx,seqinenumerate(peptides):
- seq_len=len(seq)
- # Store prefixes of length from min_overlap_length up to seq_len
- foriinrange(min_overlap_length,seq_len+1):
- prefix=seq[:i]
- prefix_index[prefix].append(idx)
- returnprefix_index
-
-
-
-[docs]
- def_build_overlap_graph(
- self,peptides:List[str],prefix_index:Dict[str,List[int]]
- )->nx.DiGraph:
-"""
- Build the overlap graph from the list of peptides.
-
- Parameters:
- peptides (List[str]): List of peptide sequences.
- prefix_index (Dict[str, List[int]]): Index of prefixes.
-
- Returns:
- nx.DiGraph: Overlap graph.
- """
- G=nx.DiGraph()
- foridx,peptideinenumerate(peptides):
- seq_len=len(peptide)
- G.add_node(peptide)
- # Iterate over possible suffix lengths
- foriinrange(self.min_overlap_length,seq_len):
- suffix=peptide[-i:]
- ifsuffixinprefix_index:
- formatching_idxinprefix_index[suffix]:
- ifmatching_idx!=idx:
- matching_peptide=peptides[matching_idx]
- overlap_length=len(suffix)
- # Ensure the overlap is the maximal possible
- existing_weight=(
- G[peptide][matching_peptide]["weight"]
- ifG.has_edge(peptide,matching_peptide)
- else0
- )
- ifoverlap_length>existing_weight:
- G.add_edge(
- peptide,matching_peptide,weight=overlap_length
- )
- # Handle the case where the entire peptide matches the prefix of another peptide
- suffix=peptide# Full peptide as suffix
- ifsuffixinprefix_index:
- formatching_idxinprefix_index[suffix]:
- ifmatching_idx!=idx:
- matching_peptide=peptides[matching_idx]
- G.add_edge(peptide,matching_peptide,weight=seq_len)
- returnG
-
-
-
-[docs]
- def_remove_transitive_edges(self,G:nx.DiGraph)->nx.DiGraph:
-"""
- Remove transitive edges from the overlap graph G.
-
- Parameters:
- G (nx.DiGraph): Overlap graph.
-
- Returns:
- nx.DiGraph: Simplified graph with transitive edges removed.
- """
- logger.info("Removing transitive edges from the overlap graph.")
-
- ## if A->B->C, A->C, then remove A->C
- to_remove=[]
- foruinG:
- forvinG.successors(u):
- forwinG.successors(v):
- ifG.has_edge(u,w):
- to_remove.append((u,w))
- G.remove_edges_from(to_remove)
-
- to_remove=[]
- foruinG:
- forvinG.successors(u):
- forwinG.successors(v):
- forxinG.successors(w):
- ifG.has_edge(u,x):
- to_remove.append((u,x))
- G.remove_edges_from(to_remove)
-
- returnG
-
-
-
-[docs]
- def_simplify_graph_to_contigs(self,G:nx.DiGraph)->List[List[str]]:
-"""
- Simplify the graph by finding non-branching paths (contigs).
-
- Parameters:
- G (nx.DiGraph): Overlap graph.
-
- Returns:
- List[List[str]]: List of contigs, each contig is a list of peptide sequences.
- """
- logger.info("Simplifying graph to contigs (non-branching paths).")
- contigs=[]
- visited=set()
-
- fornodeinG.nodes():
- # Check if node is a potential starting point of a contig
- ifG.in_degree(node)!=1orG.out_degree(node)!=1:
- ifnodenotinvisited:
- contig=[node]
- visited.add(node)
- # Extend contig forward
- current_node=node
- whileG.out_degree(current_node)==1:
- successor=next(G.successors(current_node))
- ifG.in_degree(successor)==1andsuccessornotinvisited:
- contig.append(successor)
- visited.add(successor)
- current_node=successor
- else:
- break
- contigs.append(contig)
-
- logger.info(f"Found {len(contigs)} contigs.")
- returncontigs
-
-
-
-[docs]
- def_assemble_contigs(self,contigs:List[List[str]],G:nx.DiGraph)->List[Dict]:
-"""
- Assemble sequences for each contig.
-
- Parameters:
- contigs (List[List[str]]): List of contigs.
-
- Returns:
- List[Dict]: List of assembled contigs with sequence and peptides.
- """
- logger.info("Assembling contigs from peptides.")
- assembled_contigs=[]
- forcontigincontigs:
- iflen(contig)==1:
- # Single node contig
- assembled_seq=contig[0]
- else:
- assembled_seq=contig[0]
- foriinrange(1,len(contig)):
- # Get the overlap length between consecutive peptides
- overlap_length=G[contig[i-1]][contig[i]]["weight"]
- # Append non-overlapping part of the next peptide
- assembled_seq+=contig[i][overlap_length:]
- assembled_contigs.append({"sequence":assembled_seq,"peptides":contig})
- returnassembled_contigs
-
-
-
-[docs]
- def_map_peptides_to_contigs(self,assembled_contigs:List[Dict]):
-"""
- Map each peptide to its contig.
-
- Parameters:
- assembled_contigs (List[Dict]): List of assembled contigs.
- """
- logger.info("Mapping peptides to contigs.")
- peptide_to_contig={}
- foridx,contiginenumerate(assembled_contigs):
- peptides_in_contig=contig["peptides"]
- forpeptideinpeptides_in_contig:
- peptide_to_contig[peptide]=idx
- returnpeptide_to_contig
-
-
-
-[docs]
- def_remove_redundant_peptides(
- self,peptides:List[str]
- )->Tuple[List[str],Dict[str,str]]:
-"""
- Remove peptides that are fully contained in other peptides.
-
- Parameters:
- peptides (List[str]): List of peptide sequences.
-
- Returns:
- accepted_peptides (List[str]): List of accepted peptides not redundant (largest container peptides).
- redundant_mapping (Dict[str, str]): Mapping of redundant peptide to its largest container peptide.
- """
- sorted_peptides=sorted(peptides,key=len,reverse=True)
- min_pep_len=len(sorted_peptides[-1])
- peptides_set=set(sorted_peptides)
- accepted_set=set()
- redundant_mapping={}
- forpepinsorted_peptides:
- ifpepnotinpeptides_set:
- continue
- ifpepinaccepted_set:
- continue
- accepted_set.add(pep)
- pep_len=len(pep)
- forLinrange(min_pep_len,pep_len+1):
- foriinrange(0,pep_len-L+1):
- sub_pep=pep[i:i+L]
- ifsub_pep==pep:
- continue
- ifsub_pepinpeptides_set:
- redundant_mapping[sub_pep]=pep
- peptides_set.remove(sub_pep)
-
- logger.info(f"Remove {len(redundant_mapping)} redundant peptides.")
- logger.info(f"Accepted {len(accepted_set)} non-redundant peptides.")
- returnlist(accepted_set),redundant_mapping
-
-
-
-[docs]
- def_build_full_contig_map(self,peptides:List[str])->Dict[int,List[str]]:
-"""
- Build a mapping from contig index to list of peptides (both accepted and redundant).
-
- Parameters:
- peptides (List[str]): Original list of peptides.
-
- Returns:
- Dict[int, List[str]]: Mapping from contig index to list of peptides.
- """
- full_contig_map=defaultdict(list)
- forpepinpeptides:
- contig_idx=self.peptide_to_contig.get(pep,None)
- ifcontig_idxisnotNone:
- full_contig_map[contig_idx].append(pep)
- returnfull_contig_map
-
-
-
-[docs]
    def _calculate_overlap_contig_features(self, peptides: List[str]) -> pd.DataFrame:
        """
        Calculate overlap and contig related features for peptides.

        This method computes the overlap graph, simplifies it to contigs, assembles contigs,
        and generates features for each peptide. It also handles redundant peptides by mapping them
        to their container peptides.

        Parameters:
            peptides (List[str]): List of peptide sequences.

        Returns:
            pd.DataFrame: DataFrame containing the computed features
            (one row per input peptide, keyed by 'clean_peptide').
        """
        # Drop peptides fully contained in another peptide; remember which
        # container peptide each redundant one maps to.
        accepted_peptides, redundant_mapping = self._remove_redundant_peptides(peptides)

        logger.info("Constructing prefix index...")
        prefix_index = self._construct_prefix_index(
            accepted_peptides, self.min_overlap_length
        )

        logger.info("Building overlap graph...")
        self._overlap_graph = self._build_overlap_graph(accepted_peptides, prefix_index)

        logger.info(
            f"Overlap graph has {self._overlap_graph.number_of_nodes()} nodes and {self._overlap_graph.number_of_edges()} edges."
        )
        # Removing transitive edges keeps only the minimal overlap structure
        # before contig extraction.
        self._simplified_graph = self._remove_transitive_edges(self._overlap_graph)

        logger.info(
            f"Simplified graph has {self._simplified_graph.number_of_nodes()} nodes and {self._simplified_graph.number_of_edges()} edges."
        )
        contigs = self._simplify_graph_to_contigs(self._simplified_graph)

        logger.info(f"Found {len(contigs)} contigs.")
        self.assembled_contigs = self._assemble_contigs(contigs, self._simplified_graph)
        self.peptide_to_contig = self._map_peptides_to_contigs(self.assembled_contigs)

        # Map redundant peptides to their container peptides
        for redundant_peptide, container_peptide in redundant_mapping.items():
            if container_peptide not in self.peptide_to_contig:
                logger.debug(
                    f"Container peptide {container_peptide} not found in contigs."
                )
                logger.debug(
                    f"This may occur if the container peptide is a branching node in the overlap graph."
                )
                logger.debug(f"Assigning {container_peptide} to its own contig.")

                # Create a singleton contig so the container (and its redundant
                # peptides) still receive contig-based features.
                new_contig_idx = len(self.assembled_contigs)
                new_contig = {
                    "sequence": container_peptide,
                    "peptides": [container_peptide],
                    "full_contig_peptides": [container_peptide],
                }
                self.assembled_contigs.append(new_contig)
                self.peptide_to_contig[container_peptide] = new_contig_idx

            self.peptide_to_contig[redundant_peptide] = self.peptide_to_contig[
                container_peptide
            ]

        # Build a mapping from contig index to list of peptides (both accepted and redundant)
        full_contig_map = self._build_full_contig_map(peptides)

        for contig_index, full_peptides in full_contig_map.items():
            self.assembled_contigs[contig_index]["full_contig_peptides"] = full_peptides

        # Compute features for each peptide in the filtered list
        feature_list = []
        for pep in peptides:
            contig_idx = self.peptide_to_contig.get(pep, None)
            if contig_idx is not None:
                full_count = len(full_contig_map[contig_idx])
                contig_member_count = full_count
                contig_length = len(self.assembled_contigs[contig_idx]["sequence"])
            else:
                # Peptide was not assigned to any contig: zero members, the
                # contig length defaults to the peptide's own length.
                contig_member_count = 0
                contig_length = len(pep)
            feature_list.append(
                {
                    "clean_peptide": pep,
                    "contig_member_count": contig_member_count,
                    "contig_length": contig_length,
                }
            )

        features_df = pd.DataFrame(feature_list)
        # The small epsilon (1e-6) keeps np.log finite when the count is zero.
        features_df["log_contig_member_count"] = features_df[
            "contig_member_count"
        ].apply(lambda x: np.log(x + 1e-6))
        features_df["contig_member_rank"] = features_df["contig_member_count"].rank(
            method="min", ascending=False
        )
        features_df["log_contig_member_rank"] = features_df["contig_member_rank"].apply(
            lambda x: np.log(x + 1e-6)
        )
        # How much longer the assembled contig is than the peptide itself,
        # absolute and relative.
        features_df["contig_seq_length_diff"] = features_df[
            "contig_length"
        ] - features_df["clean_peptide"].apply(len)
        features_df["contig_extension_ratio"] = features_df[
            "contig_seq_length_diff"
        ] / features_df["clean_peptide"].apply(len)

        return features_df
-
-
-
-[docs]
    def _integrate_overlap_features(self) -> pd.DataFrame:
        """
        Compute the features for each peptide, ensuring output aligns with input.

        This method preprocesses and filters peptides, calculates overlap and contig features,
        maps them back to the original peptides, and fills missing values.

        Returns:
            pd.DataFrame: DataFrame containing features for each peptide,
            one row per entry of ``self.original_peptides``.
        """
        # Results are cached on self.overlap_data; recomputation is skipped on
        # subsequent calls.
        if self.overlap_data is None:
            # 1. Preprocess and filter peptides
            self.overlap_data = pd.DataFrame(
                self.original_peptides, columns=["Peptide"]
            )
            self.overlap_data["clean_peptide"] = self.overlap_data["Peptide"].apply(
                self._preprocess_peptides
            )
            self.filtered_peptides = self._filter_peptides(
                self.overlap_data["clean_peptide"].unique().tolist()
            )

            # 2. Compute overlaps and features for filtered peptides
            features_df = self._calculate_overlap_contig_features(
                self.filtered_peptides
            )

            # 3. Map features back to the original peptides
            logger.info("Mapping features back to original peptides.")
            self.overlap_data = self.overlap_data.merge(
                features_df, on="clean_peptide", how="left"
            )

            # 4. Handle missing features for peptides that were filtered out
            missing_counts = self.overlap_data["contig_member_count"].isna().sum()
            logger.info(
                f"Number of peptides with missing features (filtered out): {missing_counts}"
            )
            if self.fill_missing == "median":
                logger.info("Filling missing values with median.")
                # Per-column medians are computed over the non-missing rows only.
                median_values = {
                    "contig_member_count": self.overlap_data[
                        "contig_member_count"
                    ].median(),
                    "log_contig_member_count": self.overlap_data[
                        "log_contig_member_count"
                    ].median(),
                    "contig_member_rank": self.overlap_data[
                        "contig_member_rank"
                    ].median(),
                    "log_contig_member_rank": self.overlap_data[
                        "log_contig_member_rank"
                    ].median(),
                    "contig_length": self.overlap_data["contig_length"].median(),
                    "contig_seq_length_diff": self.overlap_data[
                        "contig_seq_length_diff"
                    ].median(),
                    "contig_extension_ratio": self.overlap_data[
                        "contig_extension_ratio"
                    ].median(),
                }
                self.overlap_data.fillna(value=median_values, inplace=True)
            elif self.fill_missing == "zero":
                logger.info("Filling missing values with zero.")
                self.overlap_data.fillna(value=0, inplace=True)
            else:
                # Unknown option: warn and fall back to the zero-fill behavior.
                logger.warning(
                    f"Invalid fill_missing option '{self.fill_missing}'. Defaulting to zero."
                )
                self.overlap_data.fillna(value=0, inplace=True)
            logger.info("Feature computation completed.")
        else:
            logger.info("Features have already been computed. Skipping recomputation.")
        return self.overlap_data
-
-
-
-[docs]
- defgenerate_features(self)->pd.DataFrame:
-"""
- Generates features for peptide overlaps, including the count of overlapping peptides, contig length,
- and log-transformed counts and ranks.
-
- Returns:
- pd.DataFrame: DataFrame containing the features.
- """
- features_df=self._integrate_overlap_features()
- features_df=features_df[["Peptide"]+self.feature_columns]
- logger.info(f"Generated overlap features for {len(features_df)} peptides.")
- returnfeatures_df
-
-
-
-[docs]
    def get_full_data(self) -> pd.DataFrame:
        """
        Returns the full data including brother peptides and contig information for each peptide.
        In the output, the lists of contig peptides and brother peptides include redundant peptides,
        so that their counts match the corresponding peptide and contig_member_count.

        Returns:
            pd.DataFrame: DataFrame containing peptides and their brother peptides and contigs.
        """
        # Ensure overlap features (and contig assignments) exist; the call is
        # cheap on repeat invocations because it caches internally.
        self._integrate_overlap_features()
        if self.full_data is not None:
            logger.info("Full data has already been computed. Returning cached data.")
            return self.full_data
        data_list = []

        for peptide in tqdm(self.filtered_peptides):
            contig_idx = self.peptide_to_contig.get(peptide, None)
            if contig_idx is not None:
                contig_info = self.assembled_contigs[contig_idx]
                # Use full contig peptides (including redundant ones) if available
                full_peptides = contig_info.get(
                    "full_contig_peptides", contig_info["peptides"]
                )
                # "Brother" peptides: every other member of the same contig.
                brother_peptides = [p for p in full_peptides if p != peptide]
                data_list.append(
                    {
                        "clean_peptide": peptide,
                        "BrotherPeptides": brother_peptides,
                        "ContigSequence": contig_info["sequence"],
                        "ContigPeptides": full_peptides,
                    }
                )

        full_data_df = pd.DataFrame(data_list)
        # Left-merge keeps every original row; peptides without a contig get
        # NaN in the brother/contig columns.
        self.full_data = self.overlap_data.merge(
            full_data_df, on="clean_peptide", how="left"
        )
        return self.full_data
-
-
-
-
-'''
-# TODO: test
-
-def assign_brother_aggregated_feature(
- psms: PsmContainer,
- feature_columns: Union[str, List[str]],
- overlapping_source: str,
- source_name: str = 'OverlappingGroupFeatures'
-) -> None:
- """
- Assign aggregated features based on brother peptides to the PSMs.
-
- For PSMs with the same ContigSequence (brother peptides), compute the mean of specified features
- and assign these aggregated features back to each PSM in the group.
- If a PSM does not have a ContigSequence (no brothers), its new features will be set to the original values.
-
- Metadata in the PSM container:
- {
- "source_name": {
- "metadata_field_1": "value1",
- "metadata_field_2": "value2"
- }
- }
-
- Parameters:
- psms (PsmContainer): PSM container containing the peptides and features.
- feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate.
- overlapping_source (str): Source name of the overlapping peptide features.
- source_name (str): Name of the new feature source.
-
- Returns:
- None
- """
- if isinstance(feature_columns, str):
- feature_columns = [feature_columns]
- psms_df = psms.psms
-
- if psms.metadata_column is None:
- raise ValueError("The PSMs do not contain metadata.")
- metadata = psms_df[psms.metadata_column]
- print(metadata)
-
-
- def get_overlapping_data(x):
- try:
- return x.get(overlapping_source, {})
- except AttributeError:
- logger.error(f"Metadata for PSM {x} is not a dictionary.")
- return {}
-
- def get_contig_sequence(x):
- try:
- return x.get('ContigSequence', None)
- except AttributeError:
- logger.error(f"Invalid metadata for PSM {x}.")
- return None
-
- overlapping_data = metadata.apply(get_overlapping_data)
- contig_sequences = overlapping_data.apply(get_contig_sequence)
- print(overlapping_data)
- print(contig_sequences)
-
- psms_df['ContigSequence'] = contig_sequences
-
- for feature in feature_columns:
- if feature not in psms_df.columns:
- raise ValueError(f"Feature column '{feature}' not found in PSMs.")
-
- grouped_mean = psms_df.groupby('ContigSequence')[feature_columns].mean().reset_index()
- #grouped_sum = psms_df.groupby('ContigSequence')[feature_columns].sum().reset_index()
-
- """
- grouped = grouped_mean.merge(grouped_sum,
- on='ContigSequence',
- suffixes=('_brother_mean', '_brother_sum'))
- """
- psms_with_agg = psms_df.merge(grouped_mean,
- on='ContigSequence',
- how='left',
- suffixes=('', '_brother_mean'))
-
-
- # use the original feature values if the aggregated values are missing
- for feature in feature_columns:
- mean_feature = feature + '_brother_mean'
- sum_feature = feature + '_brother_sum'
- psms_with_agg[mean_feature].fillna(psms_with_agg[feature], inplace=True)
- psms_with_agg[sum_feature].fillna(psms_with_agg[feature], inplace=True)
-
-
- agg_feature_columns = []
- for feature in feature_columns:
- mean_feature = feature + '_brother_mean'
- sum_feature = feature + '_brother_sum'
- agg_feature_columns.append(mean_feature)
- agg_feature_columns.append(sum_feature)
-
- new_features_df = psms_with_agg[agg_feature_columns]
- new_features_df.columns = agg_feature_columns
-
- psms.add_features_by_index(new_features_df, source=source_name)
-
-
-'''
-
-
-
-[docs]
def assign_brother_aggregated_feature(
    psms: PsmContainer,
    feature_columns: Union[str, List[str]],
    overlapping_source: str,
    source_name: str = "OverlappingGroupFeatures",
) -> None:
    """
    Assign aggregated features based on brother peptides to the PSMs.

    For PSMs sharing the same ContigSequence (brother peptides), the mean of
    each specified feature is computed and assigned back to every PSM in the
    group as ``<feature>_contig_avg``; ``<feature>_contig_sum`` is derived as
    mean * contig_member_count. If a PSM does not have a ContigSequence
    (no brothers), its new features fall back to the original feature values.

    Parameters:
        psms (PsmContainer): PSM container containing the peptides and features.
        feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate.
        overlapping_source (str): Source name of the overlapping peptide features.
        source_name (str): Name of the new feature source.

    Returns:
        None

    Raises:
        ValueError: If the PSMs lack metadata, the 'contig_member_count'
            column, or any requested feature column.
    """
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    psms_df = psms.psms

    if psms.metadata_column is None:
        raise ValueError("The PSMs do not contain metadata.")
    metadata = psms_df[psms.metadata_column]

    def get_overlapping_data(x):
        # Each metadata entry is expected to be a dict keyed by feature source.
        if isinstance(x, dict):
            return x.get(overlapping_source, {})
        else:
            logger.warning(f"Invalid metadata entry: {x}")
            return {}

    overlapping_data = metadata.apply(get_overlapping_data)

    def get_contig_sequence(x):
        if isinstance(x, dict):
            return x.get("ContigSequence", None)
        else:
            logger.warning(f"Invalid overlapping data entry: {x}")
            return None

    contig_sequences = overlapping_data.apply(get_contig_sequence)

    # NOTE: this adds a 'ContigSequence' column to the container's DataFrame
    # in place.
    psms_df["ContigSequence"] = contig_sequences

    if "contig_member_count" not in psms_df.columns:
        raise ValueError("'contig_member_count' column not found in PSMs.")

    missing_features = [
        feature for feature in feature_columns if feature not in psms_df.columns
    ]
    if missing_features:
        raise ValueError(f"Feature columns not found in PSMs: {missing_features}")

    grouped_mean = (
        psms_df.groupby("ContigSequence")[feature_columns].mean().reset_index()
    )
    grouped_mean = grouped_mean.rename(
        columns={feature: f"{feature}_contig_avg" for feature in feature_columns}
    )

    psms_with_agg = psms_df.merge(grouped_mean, on="ContigSequence", how="left")

    # Loop-invariant: fill missing member counts once, not once per feature.
    psms_with_agg["contig_member_count"] = psms_with_agg[
        "contig_member_count"
    ].fillna(0)

    for feature in feature_columns:
        mean_feature = f"{feature}_contig_avg"
        sum_feature = f"{feature}_contig_sum"
        # TODO(review): an earlier docstring described the sum as
        # mean * (contig_member_count + 1); current behavior is
        # mean * contig_member_count — confirm which is intended.
        psms_with_agg[sum_feature] = psms_with_agg[mean_feature] * (
            psms_with_agg["contig_member_count"]
        )
        # Assign instead of chained fillna(inplace=True) on a column selection:
        # the chained form is deprecated in pandas 2.x and may act on a copy.
        psms_with_agg[sum_feature] = psms_with_agg[sum_feature].fillna(
            psms_with_agg[feature]
        )
        psms_with_agg[mean_feature] = psms_with_agg[mean_feature].fillna(
            psms_with_agg[feature]
        )

    agg_feature_columns = [f"{feature}_contig_avg" for feature in feature_columns] + [
        f"{feature}_contig_sum" for feature in feature_columns
    ]

    new_features_df = psms_with_agg[agg_feature_columns]

    psms.add_features_by_index(features_df=new_features_df, source=source_name)
-[docs]
-classSpectraSimilarityFeatureGenerator(BaseFeatureGenerator):
-"""
- Feature generator for calculating similarity between experimental and predicted spectra.
-
- This class works through the following steps:
- 1. Extract experimental spectral data from mzML files
- 2. Use Koina for theoretical spectra prediction
- 3. Align experimental and predicted spectra
- 4. Calculate similarity metrics as features
-
- Parameters:
- peptides (List[str]): List of peptide sequences
- charges (List[int]): List of charge states
- scan_ids (List[int]): List of scan IDs
- mz_file_paths (List[str]): List of mzML file paths
- model_type (str): Prediction model type, either "HCD" or "CID"
- collision_energies (List[float]): List of collision energies, required when model_type is "HCD"
- remove_pre_nxt_aa (bool): Whether to remove preceding and next amino acids, default is True
- remove_modification (bool): Whether to remove modifications, default is True
- url (str): Koina server URL, default is "koina.wilhelmlab.org:443"
- top_n (int): Number of top peaks to use for alignment, default is 12
- tolerance_ppm (float): Mass tolerance for alignment in ppm, default is 20
- """
-
- def__init__(
- self,
- spectrum_ids:List[str],
- peptides:List[str],
- charges:List[int],
- scan_ids:List[int],
- mz_file_paths:List[str],
- model_type:str,
- collision_energies:List[float]=None,
- instruments:List[str]=None,
- fragmentation_types:List[str]=None,
- remove_pre_nxt_aa:bool=False,
- mod_dict:Optional[Dict[str,str]]=None,
- url:str="koina.wilhelmlab.org:443",
- top_n:int=36,
- tolerance_ppm:float=20,
- ):
- self.spectrum_ids=spectrum_ids
- self.peptides=peptides
- self.charges=charges
- self.scan_ids=scan_ids
- self.mz_file_paths=mz_file_paths
- self.model_type=model_type
- self.collision_energies=collision_energies
- self.instruments=instruments
- self.fragmentation_types=fragmentation_types
- self.remove_pre_nxt_aa=remove_pre_nxt_aa
- self.mod_dict=mod_dict
- self.url=url
- self.top_n=top_n
- self.tolerance_ppm=tolerance_ppm
- self.results=None
- self._raw_predictions=None
-
- logger.info(
- f"Initializing SpectraSimilarityFeatureGenerator with {len(peptides)} PSMs"
- )
- logger.info(f"Using model: {self.model_type}")
-
- self.df=pd.DataFrame(
- {
- "spectrum_id":self.spectrum_ids,
- "scan":self.scan_ids,
- "peptide":self.peptides,
- "charge":self.charges,
- "mz_file_path":self.mz_file_paths,
- }
- )
-
- self.df["processed_peptide"]=self.df["peptide"].apply(
- self._preprocess_peptide
- )
- logger.info(
- f"Recevied {len(self.df)} PSMs for spectral similarity feature generation"
- )
-
- @property
- defid_column(self)->List[str]:
-"""
- Returns a list of input columns required for the feature generator.
- """
- return["spectrum_id","peptide","charge"]
-
- @property
- deffeature_columns(sself)->List[str]:
-"""
- Returns a list of feature columns generated by the feature generator.
- """
- return[
- "spectral_angle_similarity",
- "cosine_similarity",
- "pearson_correlation",
- "spearman_correlation",
- "mean_squared_error",
- "unweighted_entropy_similarity",
- "predicted_seen_nonzero",
- "predicted_not_seen",
- ]
-
-
-[docs]
- definput_df(self)->pd.DataFrame:
-"""
- Return the generated features as a DataFrame.
-
- Returns:
- pd.DataFrame: DataFrame containing the generated features
- """
- returnself.df
-
-
-
-[docs]
- def_preprocess_peptide(self,peptide:str)->str:
-"""
- Preprocess peptide sequence.
-
- As Prosit does not support non-standard amino acid 'U', we replace it with 'C'.
-
- Parameters
- ----------
- peptide : str
- Original peptide sequence.
-
- Returns
- -------
- str
- Processed peptide sequence.
-
- Notes
- -----
- This is nonsense when it comes to spectral prediction, but we need to keep it
- for compatibility with Koina. In the future, this should be prohibited at the input level.
- """
- processed_peptide=peptide
-
- ifself.remove_pre_nxt_aa:
- processed_peptide=utils.remove_pre_and_nxt_aa(processed_peptide)
-
- ifself.mod_dictisNone:
- processed_peptide=utils.remove_modifications(processed_peptide)
- else:
- formod,replacementinself.mod_dict.items():
- processed_peptide=processed_peptide.replace(mod,replacement)
-
- # Replace non-standard amino acid 'U' with 'C'.
- # This is nosense when it comes to spectral prediction.
- # But we need to keep it for compatibility with Koina.
- # In the future, this should be prohibited at the input level.
- if"U"inutils.remove_modifications(processed_peptide):
- logger.warning(
- f"Peptide sequence contains non-standard amino acid 'U': {processed_peptide}. Replacing with 'C'."
- )
- # Replace 'U' with 'C'
- processed_peptide=processed_peptide.replace("U","C")
-
- returnprocessed_peptide
-
-
-
-[docs]
- def_extract_experimental_spectra(self)->pd.DataFrame:
-"""
- Extract experimental spectral data from mzML files.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing experimental spectral data.
-
- Notes
- -----
- The method groups scan IDs by file path for efficiency and extracts
- spectral data from each mzML file. The resulting DataFrame contains
- m/z values, intensities, and associated metadata for each spectrum.
- """
- logger.info("Extracting experimental spectral data...")
-
- # Group scan IDs by file path for efficiency
- file_to_scans={}
- fori,rowinself.df.iterrows():
- scan_id=row["scan"]
- file_path=row["mz_file_path"]
- iffile_pathnotinfile_to_scans:
- file_to_scans[file_path]=[]
- file_to_scans[file_path].append(scan_id)
-
- exp_spectra_dfs=[]
- forfile_path,scan_idsinfile_to_scans.items():
- logger.info(f"Extracting {len(scan_ids)} scans from {file_path}")
- spectra_df=extract_mzml_data(file_path,scan_ids)
- spectra_df["mz_file_path"]=file_path
- exp_spectra_dfs.append(spectra_df)
-
- # Merge spectral data from all files
- ifexp_spectra_dfs:
- exp_spectra_df=pd.concat(exp_spectra_dfs,ignore_index=True)
- logger.info(
- f"Successfully extracted {len(exp_spectra_df)} experimental spectra"
- )
- returnexp_spectra_df
- else:
- logger.warning("No experimental spectral data found")
- returnpd.DataFrame()
-
-
-
-[docs]
- def_predict_theoretical_spectra(
- self,processed_peptides:List[str],charges:List[int]
- )->pd.DataFrame:
-"""
- Use Koina to predict theoretical spectra.
-
- Parameters
- ----------
- processed_peptides : list of str
- List of preprocessed peptide sequences.
- charges : list of int
- List of charge states.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing predicted spectral data.
-
- Raises
- ------
- Exception
- If there is an error during Koina prediction.
-
- Notes
- -----
- The method uses Koina to predict theoretical spectra for each peptide.
- For AlphaPeptDeep_ms2_generic model, inputs are split into batches
- grouped by peptide length for prediction.
- """
- logger.info(f"Predicting theoretical spectra using {self.model_type}...")
-
- inputs=pd.DataFrame()
- inputs["peptide_sequences"]=np.array(processed_peptides)
- inputs["precursor_charges"]=np.array(charges)
-
- ifself.collision_energiesisnotNone:
- inputs["collision_energies"]=np.array(self.collision_energies)
-
- ifself.instrumentsisnotNone:
- inputs["instrument_types"]=np.array(self.instruments)
-
- ifself.fragmentation_typesisnotNone:
- inputs["fragmentation_types"]=np.array(self.fragmentation_types)
-
- model=Koina(self.model_type,self.url)
-
- try:
- predictions=model.predict(inputs)
- exceptExceptionase:
- logger.error(f"Error during Koina prediction: {e}")
- logger.error("Koina prediction failed. Please check:")
- logger.error("- Input parameters compatibility")
- logger.error("- Supported modifications")
- logger.error("- Peptide length limits")
- logger.error(f"Details at: https://koina.proteomicsdb.org/")
-
- ifself.model_type=="AlphaPeptDeep_ms2_generic":
- logger.info(
- "Splitting inputs into batches grouped by peptide length for prediction..."
- )
- importre
-
- # Compute peptide length by removing modifications in square brackets.
- inputs["peptide_length"]=inputs["peptide_sequences"].apply(
- lambdax:len(re.sub(r"\[.*?\]","",x))
- )
-
- predictions_batches=[]
- # Group the DataFrame by calculated peptide length. Each group is submitted as one batch.
- forpeptide_length,groupininputs.groupby("peptide_length"):
- logger.info(
- f"Processing {len(group)} entries for peptide length {peptide_length}."
- )
- try:
- batch_predictions=model.predict(group)
- predictions_batches.append(batch_predictions)
- exceptExceptionasbatch_error:
- logger.error(
- f"Error during Koina prediction for batch with peptide length {peptide_length}: {batch_error}"
- )
- logger.error(f"Batch data:\n{group}")
- raise
- predictions=pd.concat(predictions_batches,ignore_index=True)
- else:
- raiseValueError(f"Unsupported model type: {self.model_type}")
-
- # Save the raw prediction results
- self._raw_predictions=predictions.copy()
-
- # Convert prediction results to a suitable format
- pred_df=predictions.copy()
- pred_df.rename(
- columns={
- "peptide_sequences":"processed_peptide",
- "precursor_charges":"charge",
- "intensities":"pred_intensity",
- "mz":"pred_mz",
- },
- inplace=True,
- )
-
- # Group by peptide and charge, convert predicted mz and intensity to lists
- grouped_df=(
- pred_df.groupby(["processed_peptide","charge"])
- .agg({"pred_intensity":list,"pred_mz":list,"annotation":list})
- .reset_index()
- )
-
- logger.info(f"Successfully predicted {len(grouped_df)} theoretical spectra")
- returngrouped_df
-
-
    @property
    def raw_predictions(self) -> pd.DataFrame:
        """
        Returns the raw prediction results from Koina.

        Returns:
            pd.DataFrame: Raw prediction results DataFrame, or None if
            feature generation has not populated it yet.
        """
        if self._raw_predictions is None:
            # NOTE(review): _generate_features() is expected to populate
            # self._raw_predictions as a side effect (see
            # _predict_theoretical_spectra, which assigns it) — confirm;
            # otherwise this property can still return None here.
            if self.results is None:
                self.results = self._generate_features()
        return self._raw_predictions
-
-
-[docs]
- defget_raw_predictions(self)->pd.DataFrame:
-"""
- Get the raw prediction results DataFrame from Koina.
-
- Returns:
- pd.DataFrame: Raw prediction results DataFrame
- """
- returnself.raw_predictions
-
-
-
-[docs]
- defsave_raw_predictions(self,file_path:str,**kwargs)->None:
-"""
- Save the raw prediction results to a file.
-
- Parameters:
- file_path (str): Path to save the file
- **kwargs: Other parameters passed to pandas.DataFrame.to_csv
- """
- if"index"notinkwargs:
- kwargs["index"]=False
- ifself.raw_predictionsisnotNone:
- self.raw_predictions.to_csv(file_path,**kwargs)
- logger.info(f"Raw prediction results saved to: {file_path}")
- else:
- logger.warning("No raw prediction results available to save.")
-
-
-
-[docs]
- def_sort_spectrum_by_mz(
- self,
- mz:List[float],
- intensity:List[float],
- annotation:Optional[List[str]]=None,
- )->Tuple[np.ndarray,np.ndarray,Optional[np.ndarray]]:
-"""
- Sort spectrum (m/z, intensity, and optionally annotation) by m/z values.
-
- Parameters
- ----------
- mz : list of float
- m/z values.
- intensity : list of float
- Intensity values.
- annotation : list of str, optional
- Fragment annotations.
-
- Returns
- -------
- tuple of (np.ndarray, np.ndarray, np.ndarray or None)
- Sorted m/z, intensity, and annotation arrays.
- If annotation is None, the third element will be None.
- """
- mz_array=np.array(mz)
- intensity_array=np.array(intensity)
- sorted_indices=np.argsort(mz_array)
- sorted_mz=mz_array[sorted_indices]
- sorted_intensity=intensity_array[sorted_indices]
-
- sorted_annotation=None
- ifannotationisnotNone:
- annotation_array=np.array(annotation)
- sorted_annotation=annotation_array[sorted_indices]
-
- returnsorted_mz,sorted_intensity,sorted_annotation
-[docs]
- def_get_top_peaks_vectors(
- self,
- aligned_exp_intensity:np.ndarray,
- aligned_pred_intensity:np.ndarray,
- matched_indices:List[Tuple],
- top_n:int,
- )->Tuple[np.ndarray,np.ndarray,List[Tuple]]:
-"""
- Extract top N peaks based on predicted intensity for similarity calculation
-
- Parameters:
- aligned_exp_intensity (np.ndarray): Aligned experimental intensity vector
- aligned_pred_intensity (np.ndarray): Aligned predicted intensity vector
- matched_indices (List[Tuple]): Matching index pairs (pred_idx, exp_idx)
- top_n (int): Number of top peaks to extract
-
- Returns:
- Tuple[np.ndarray, np.ndarray, List[Tuple]]:
- - Top N experimental intensity vector
- - Top N predicted intensity vector
- - Top N matching index pairs
- """
- # Get indices of top N peaks by predicted intensity
- num_peaks=min(top_n,len(aligned_pred_intensity))
- top_pred_indices=np.argsort(-aligned_pred_intensity)[:num_peaks]
-
- # Extract top N peaks
- top_exp_intensity=np.zeros(num_peaks)
- top_pred_intensity=np.zeros(num_peaks)
- top_matched_indices=[]
-
- fori,orig_idxinenumerate(top_pred_indices):
- top_exp_intensity[i]=aligned_exp_intensity[orig_idx]
- top_pred_intensity[i]=aligned_pred_intensity[orig_idx]
- top_matched_indices.append(matched_indices[orig_idx])
-
- returntop_exp_intensity,top_pred_intensity,top_matched_indices
-
-
-
-[docs]
- def_align_spectra(
- self,
- exp_mz:List[float],
- exp_intensity:List[float],
- pred_mz:List[float],
- pred_intensity:List[float],
- pred_annotation:Optional[List[str]]=None,
- )->Tuple[np.ndarray,np.ndarray,List[Tuple]]:
-"""
- Align experimental and predicted spectra.
-
- This is a wrapper around _align_spectra_all_peaks and _get_top_peaks_vectors
- to maintain backward compatibility while using the improved algorithm.
-
- Parameters
- ----------
- exp_mz : list of float
- Experimental m/z values.
- exp_intensity : list of float
- Experimental intensity values.
- pred_mz : list of float
- Predicted m/z values.
- pred_intensity : list of float
- Predicted intensity values.
- pred_annotation : list of str, optional
- Predicted fragment annotations.
-
- Returns
- -------
- tuple of (np.ndarray, np.ndarray, list of tuple)
- - Aligned experimental intensity vector
- - Predicted intensity vector
- - Matching index pairs (for top N peaks)
-
- Notes
- -----
- The method first aligns all peaks using _align_spectra_all_peaks, then
- extracts the top N peaks using _get_top_peaks_vectors for compatibility
- with existing code.
- """
- # First align all peaks
- all_exp_intensity,all_pred_intensity,all_matched_indices,additional_info=(
- self._align_spectra_all_peaks(
- exp_mz,
- exp_intensity,
- pred_mz,
- pred_intensity,
- pred_annotation,
- use_ppm=True,
- )
- )
-
- # Then get top N peaks for compatibility with existing code
- top_exp_intensity,top_pred_intensity,top_matched_indices=(
- self._get_top_peaks_vectors(
- all_exp_intensity,all_pred_intensity,all_matched_indices,self.top_n
- )
- )
-
- returntop_exp_intensity,top_pred_intensity,top_matched_indices
-
-
-
-[docs]
- def_normalize_vector_l2(self,vector:np.ndarray)->np.ndarray:
-"""
- Normalize a vector using L2 normalization (unit vector).
-
- Parameters
- ----------
- vector : np.ndarray
- Input vector to normalize.
-
- Returns
- -------
- np.ndarray
- L2-normalized vector.
-
- Notes
- -----
- If the input vector has zero norm, the original vector is returned unchanged.
- """
- norm=np.linalg.norm(vector)# Calculate the L2 norm (Euclidean norm)
- ifnorm==0:
- returnvector
- returnvector/norm
-
-
-
-[docs]
- def_normalize_vector_sum(self,vector:np.ndarray)->np.ndarray:
-"""
- Perform sum normalization (probability normalization) on a vector.
-
- Parameters
- ----------
- vector : np.ndarray
- Input vector.
-
- Returns
- -------
- np.ndarray
- Sum normalized vector (sum to 1).
-
- Notes
- -----
- If the input vector has zero sum, the original vector is returned unchanged.
- """
- total=np.sum(vector)
- iftotal>0:
- returnvector/total
- returnvector
-
-
-
-[docs]
- def_calculate_spectral_angle_similarity(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate the spectral angle between experimental and predicted vectors.
-
- Normalize the angle to [0, 1] where 1 is the best similarity.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Spectral angle similarity (0-1, higher is better).
-
- Notes
- -----
- The spectral angle is calculated as: SA = 1 - (2 * angle / π),
- where angle is the angle between the normalized vectors in radians.
- """
- exp_norm=self._normalize_vector_l2(exp_vector)
- pred_norm=self._normalize_vector_l2(pred_vector)
- dot_product=np.sum(exp_norm*pred_norm)
-
- # Clamp dot product to [-1, 1] to avoid numerical issues
- dot_product=np.clip(dot_product,-1.0,1.0)
- angle=np.arccos(dot_product)# Angle in radians
-
- # Return similarity score: SA = 1 - (2 * angle / π)
- return1-(2*angle/np.pi)
-
-
-
-[docs]
- def_calculate_cosine_similarity(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate the cosine similarity between experimental and predicted vectors.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Cosine similarity (0-1, higher is better).
-
- Notes
- -----
- The cosine similarity is calculated as 1 - cosine_distance.
- If either vector has zero sum, returns 0.0.
- """
- ifnp.sum(exp_vector)==0ornp.sum(pred_vector)==0:
- return0.0
-
- # Normalize vectors using L2 normalization
- exp_norm=self._normalize_vector_l2(exp_vector)
- pred_norm=self._normalize_vector_l2(pred_vector)
-
- # Use 1 - cosine distance = cosine similarity
- return1.0-cosine(exp_norm,pred_norm)
-
-
-
-[docs]
- def_calculate_spearman_correlation(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate Spearman correlation coefficient between experimental and predicted vectors.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Spearman correlation coefficient (-1 to 1, higher is better).
-
- Notes
- -----
- If either vector has no variation (std = 0), returns 0.0.
- """
- # Handle vectors with no variation
- ifnp.std(exp_vector)==0ornp.std(pred_vector)==0:
- return0.0
-
- try:
- r,_=spearmanr(exp_vector,pred_vector)
- returnr
- except:
- return
-
-
-
-[docs]
- def_calculate_pearson_correlation(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate Pearson correlation coefficient between experimental and predicted vectors.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Pearson correlation coefficient (-1 to 1, higher is better).
-
- Notes
- -----
- If either vector has no variation (std = 0), returns 0.0.
- """
- # Handle vectors with no variation
- ifnp.std(exp_vector)==0ornp.std(pred_vector)==0:
- return0.0
-
- try:
- r,_=pearsonr(exp_vector,pred_vector)
- returnr
- except:
- return0.0
-
-
-
-[docs]
- def_calculate_mean_squared_error(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate mean squared error between experimental and predicted vectors.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Mean squared error (lower is better).
-
- Notes
- -----
- The vectors are normalized using L2 normalization before calculating
- the mean squared error for fair comparison.
- """
- # Normalize vectors for fair comparison using L2 normalization
- exp_norm=self._normalize_vector_l2(exp_vector)
- pred_norm=self._normalize_vector_l2(pred_vector)
-
- returnnp.mean((exp_norm-pred_norm)**2)
-
-
-
-[docs]
- def_calculate_entropy(self,vector:np.ndarray)->float:
-"""
- Calculate Shannon entropy of a vector that has already been sum-1 normalized.
-
- Parameters
- ----------
- vector : np.ndarray
- Input vector, which has already been sum-1 normalized.
-
- Returns
- -------
- float
- Shannon entropy.
-
- Notes
- -----
- Only non-zero probabilities are considered for entropy calculation.
- If all probabilities are zero, returns 0.0.
- """
- # Only consider non-zero probabilities for entropy calculation
- mask=vector>0
- ifnotnp.any(mask):
- return0.0
-
- prob_vector=vector[mask]
- return-np.sum(prob_vector*np.log(prob_vector))
-
-
-
-[docs]
- def_calculate_unweighted_entropy_similarity(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->float:
-"""
- Calculate unweighted spectral entropy between experimental and predicted vectors.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- float
- Spectral entropy similarity.
-
- Notes
- -----
- Based on the method described in https://www.nature.com/articles/s41592-021-01331-z.
- The spectral entropy is calculated using the formula:
- 1 - (2*S_PM - S_P - S_M)/ln(4), where S_PM is the entropy of the mixed
- distribution, and S_P and S_M are the entropies of the individual distributions.
- """
- # Sum-to-1 normalization
- sum_normalized_exp_vector=self._normalize_vector_sum(exp_vector)
- sum_normalized_pred_vector=self._normalize_vector_sum(pred_vector)
- s_exp=self._calculate_entropy(sum_normalized_exp_vector)
- s_pred=self._calculate_entropy(sum_normalized_pred_vector)
- s_mixed=self._calculate_entropy(
- 0.5*(sum_normalized_exp_vector+sum_normalized_pred_vector)
- )
-
- # Calculate spectral entropy using formula: 1 - (2*S_PM - S_P - S_M)/ln(4)
- unweighted_entropy_similarity=1.0-(2*s_mixed-s_exp-s_pred)/np.log(4)
-
- returnunweighted_entropy_similarity
-
-
    # TODO: Assign a weight to each peak based on the entropy of the spectrum.
    # The original paper uses 3 as an entropy cutoff to assign more weight to the low-intensity peaks,
    # but that cutoff value was determined on a small-molecule dataset rather than a proteomics dataset.
    # Thus, we need to design our own heuristic algorithm to calculate a similar feature for proteomics data.
    # We can refer to the practice of MSBooster.
    # url: https://github.com/Nesvilab/MSBooster/blame/master/src/main/java/features/spectra/SpectrumComparison.java#L803
-
- # def _calculate_entropy_similarity(self, exp_vector: np.ndarray, pred_vector: np.ndarray) -> float:
-
-
-[docs]
- def_calculate_predicted_counts(
- self,exp_vector:np.ndarray,pred_vector:np.ndarray
- )->Tuple[int,int]:
-"""
- Calculate counts of predicted peaks seen/not seen in experimental spectrum.
-
- Parameters
- ----------
- exp_vector : np.ndarray
- Experimental intensity vector.
- pred_vector : np.ndarray
- Predicted intensity vector.
-
- Returns
- -------
- tuple of (int, int)
- - predicted_seen_nonzero: Number of predicted peaks that are also present in experimental spectrum
- - predicted_not_seen: Number of predicted peaks that are not present in experimental spectrum
- """
- predicted_seen_nonzero=np.sum((pred_vector>0)&(exp_vector>0))
- predicted_not_seen=np.sum((pred_vector>0)&(exp_vector==0))
-
- returnpredicted_seen_nonzero,predicted_not_seen
-[docs]
- defgenerate_features(self)->pd.DataFrame:
-"""
- Public interface for generating spectral similarity features.
-
- Returns
- -------
- pd.DataFrame
- DataFrame containing the generated features.
-
- Notes
- -----
- This method is a wrapper around _generate_features that ensures
- the results are cached and only computed once.
- """
- ifself.resultsisNone:
- self.results=self._generate_features()
- returnself.results[self.id_column+self.feature_columns]
-
-
-
-[docs]
- defget_full_data(self)->pd.DataFrame:
-"""
- Return the full DataFrame with all columns.
-
- Returns
- -------
- pd.DataFrame
- Full DataFrame with all columns.
-
- Notes
- -----
- This method returns the complete DataFrame including all intermediate
- results and raw data used in feature generation.
- """
- returnself.results
def read_pepxml(pepxml_files, decoy_prefix="DECOY_"):
    """
    Read PSMs from a list of PepXML files.

    Parameters
    ----------
    pepxml_files : Union[str, List[str]]
        The file path to the PepXML file or a list of file paths.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file. Default is "DECOY_".

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Raises
    ------
    ValueError
        If the PepXML files were generated by Percolator or PeptideProphet.

    Notes
    -----
    Parses the files, derives mass-difference and ion-match features,
    one-hot encodes the precursor charge, log-transforms p-value-like
    columns, and wraps everything in a PsmContainer.
    """
    PROTON_MASS = 1.00727646677
    if isinstance(pepxml_files, str):
        pepxml_files = [pepxml_files]
    psms = pd.concat([_parse_pepxml(path, decoy_prefix) for path in pepxml_files])

    # Refuse files that were already post-processed by another rescorer:
    illegal_cols = {
        "Percolator q-Value",
        "Percolator PEP",
        "Percolator SVMScore",
    }
    if illegal_cols & set(psms.columns):
        raise ValueError(
            "The PepXML files appear to have generated by Percolator or "
            "PeptideProphet; hence, they should not be analyzed with mokapot."
        )

    # Mass-difference features (useful for open modification searches):
    psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"]
    observed_mz = psms["exp_mass"] / psms["charge"] + PROTON_MASS
    theoretical_mz = psms["calc_mass"] / psms["charge"] + PROTON_MASS
    psms["abs_mz_diff"] = (observed_mz - theoretical_mz).abs()

    # Fraction of theoretical fragment ions that were matched:
    if "num_matched_ions" in psms.columns and "tot_num_ions" in psms.columns:
        if (psms["tot_num_ions"] != 0).all():
            psms["matched_ions_ratio"] = (
                psms["num_matched_ions"] / psms["tot_num_ions"]
            )

    # Log number of candidates:
    if "num_matched_peptides" in psms.columns:
        psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"])

    # One-hot encode the precursor charge:
    charge_dummies = pd.get_dummies(psms["charge"], prefix="charge")
    psms = pd.concat([psms, charge_dummies], axis=1)

    # psms = psms.drop("charge", axis=1)
    # Columns that identify a PSM rather than describe it:
    nonfeat_cols = [
        "ms_data_file",
        "scan",
        "spectrum",
        "label",
        "calc_mass",
        "peptide",
        "proteins",
        "charge",
        "retention_time",
    ]

    feat_cols = [c for c in psms.columns if c not in nonfeat_cols]
    # -log10 transform p-value / E-value style features:
    psms = psms.apply(_log_features, features=feat_cols)
    rescoring_features = {"Original": feat_cols}

    return PsmContainer(
        psms=psms,
        label_column="label",
        scan_column="scan",
        spectrum_column="spectrum",
        ms_data_file_column="ms_data_file",
        peptide_column="peptide",
        protein_column="proteins",
        charge_column="charge",
        rescoring_features=rescoring_features,
        hit_rank_column="rank",
        retention_time_column="retention_time",
    )
-
-
-
-"""
-This code is adapted from 'Mokapot'
-Source: https://github.com/wfondrie/mokapot
-License: Apache License 2.0
-"""
-
-
-
def _parse_pepxml(pepxml_file, decoy_prefix):
    """
    Parse the PSMs of a PepXML into a DataFrame.

    Parameters
    ----------
    pepxml_file : str
        The PepXML file to parse.
    decoy_prefix : str
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the information about each PSM.

    Raises
    ------
    ValueError
        If the file is not a PepXML file or is malformed.
    """
    logger.info("Reading %s...", pepxml_file)
    parser = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary")
    parse_fun = partial(_parse_msms_run, decoy_prefix=decoy_prefix)
    spectra = map(parse_fun, parser)
    try:
        # Parsing is lazy: XMLSyntaxError surfaces while from_records
        # consumes the chained generators, hence the try around it.
        psms = itertools.chain.from_iterable(spectra)
        df = pd.DataFrame.from_records(itertools.chain.from_iterable(psms))
        df["ms_data_file"] = df["ms_data_file"].astype("category")
    except etree.XMLSyntaxError as err:
        # Bug fix: chain the original parse error instead of discarding it,
        # so the underlying cause remains visible in the traceback.
        raise ValueError(
            f"{pepxml_file} is not a PepXML file or is malformed."
        ) from err
    return df
-
-
-
-
def _parse_msms_run(msms_run, decoy_prefix):
    """
    Parse a single MS/MS run.

    Parameters
    ----------
    msms_run : tuple of anything, lxml.etree.Element
        The second element of the tuple should be the XML element for a single
        msms_run. The first is not used, but is necessary for compatibility
        with using :code:`map()`.
    decoy_prefix : str
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file.

    Yields
    ------
    dict
        A dictionary describing all of the PSMs in a run.
    """
    _, run_element = msms_run
    ms_data_file = run_element.get("base_name")
    raw_extension = run_element.get("raw_data")
    # base_name sometimes omits the raw-data extension; append it if so.
    if not ms_data_file.endswith(raw_extension):
        ms_data_file += raw_extension

    run_info = {"ms_data_file": ms_data_file}
    yield from (
        _parse_spectrum(query, run_info, decoy_prefix)
        for query in run_element.iter("{*}spectrum_query")
    )
-
-
-
-
def _parse_spectrum(spectrum, run_info, decoy_prefix):
    """
    Parse the PSMs for a single mass spectrum.

    Parameters
    ----------
    spectrum : lxml.etree.Element
        The XML element for a single spectrum.
    run_info : dict
        The parsed run data.
    decoy_prefix : str
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file.

    Yields
    ------
    dict
        A dictionary describing all of the PSMs for a spectrum.
    """
    spec_info = dict(run_info)
    spec_info["spectrum"] = str(spectrum.get("spectrum"))
    spec_info["scan"] = int(spectrum.get("end_scan"))
    spec_info["charge"] = int(spectrum.get("assumed_charge"))
    spec_info["retention_time"] = float(spectrum.get("retention_time_sec"))
    spec_info["exp_mass"] = float(spectrum.get("precursor_neutral_mass"))
    for search_result in spectrum.iter("{*}search_result"):
        for hit in search_result.iter("{*}search_hit"):
            yield _parse_psm(hit, spec_info, decoy_prefix=decoy_prefix)
-
-
-
-
-[docs]
-def_parse_psm(psm_info,spec_info,decoy_prefix):
-"""
- Parse a single PSM.
-
- Parameters
- ----------
- psm_info : lxml.etree.Element
- The XML element containing information about the PSM.
- spec_info : dict
- The parsed spectrum data.
- decoy_prefix : str
- The prefix used to indicate a decoy protein in the description lines
- of the FASTA file.
-
- Returns
- -------
- dict
- A dictionary containing parsed data about the PSM.
- """
- psm=spec_info.copy()
- psm["calc_mass"]=float(psm_info.get("calc_neutral_pep_mass"))
- psm["peptide"]=psm_info.get("peptide")
- psm["proteins"]=[psm_info.get("protein").split(" ")[0]]
- psm["label"]=notpsm["proteins"][0].startswith(decoy_prefix)
- psm["rank"]=int(psm_info.get("hit_rank"))
-
- # Begin features:
- try:
- psm["missed_cleavages"]=int(psm_info.get("num_missed_cleavages"))
- exceptTypeError:
- pass
-
- try:
- psm["ntt"]=int(psm_info.get("num_tol_term"))
- exceptTypeError:
- pass
-
- try:
- psm["num_matched_peptides"]=int(psm_info.get("num_matched_peptides"))
- exceptTypeError:
- pass
-
- try:
- psm["num_matched_ions"]=int(psm_info.get("num_matched_ions"))
- exceptTypeError:
- pass
-
- try:
- psm["tot_num_ions"]=int(psm_info.get("tot_num_ions"))
- exceptTypeError:
- pass
-
- queries=[
- "{*}modification_info",
- "{*}search_score",
- "{*}alternative_protein",
- ]
- forelementinpsm_info.iter(*queries):
- if"modification_info"inelement.tag:
- offset=0
- mod_pep=psm["peptide"]
- formodinelement.iter("{*}mod_aminoacid_mass"):
- idx=offset+int(mod.get("position"))
- mass=mod.get("mass")
- mod_pep=mod_pep[:idx]+"["+mass+"]"+mod_pep[idx:]
- offset+=2+len(mass)
-
- psm["peptide"]=mod_pep
-
- elif"alternative_protein"inelement.tag:
- psm["proteins"].append(element.get("protein").split(" ")[0])
- ifnotpsm["label"]:
- psm["label"]=notpsm["proteins"][-1].startswith(decoy_prefix)
-
- else:
- psm[element.get("name")]=element.get("value")
-
- psm["proteins"]="\t".join(psm["proteins"])
- returnpsm
-
-
-
-
def _log_features(col, features):
    """
    Log-transform columns that are p-values or E-values.

    Parameters
    ----------
    col : pandas.Series
        A column of the dataset.
    features : list of str
        The features of the dataset. Only feature columns will be considered
        for transformation.

    Returns
    -------
    pandas.Series
        The log-transformed values of the column if the feature was determined
        to be a p-value.

    Notes
    -----
    This function:
    1. Detects columns written in scientific notation and logs them
    2. Uses a simple heuristic to find p-value / E-value features
    3. Only transforms if values span >4 orders of magnitude
    4. Preserves precision for scientific notation values
    """
    # Non-feature columns pass through untouched; booleans become 0/1 floats.
    if col.name not in features:
        return col
    elif col.dtype == "bool":
        return col.astype(float)

    # Work on a lowercase string view so "1E-5" and "1e-5" are treated alike.
    col = col.astype(str).str.lower()

    # Detect columns written in scientific notation and log them:
    # This is specifically needed to preserve precision - log10(mantissa)
    # plus the integer exponent avoids underflow for tiny p-values.
    if col.str.contains("e").any() and (col.astype(float) > 0).all():
        # Split "1.5e-20" into mantissa ("1.5") and exponent ("-20").
        split = col.str.split("e", expand=True)
        root = split.loc[:, 0]
        root = root.astype(float)
        power = split.loc[:, 1]
        # Values without an exponent part (e.g. "0.5") get exponent 0.
        power[pd.isna(power)] = "0"
        power = power.astype(int)

        # Zero mantissas would make log10 blow up; map them to the smallest
        # observed exponent with mantissa 1 instead.
        zero_idx = root == 0
        root[zero_idx] = 1
        power[zero_idx] = power[~zero_idx].min()
        # Only transform when values span at least 4 orders of magnitude.
        diff = power.max() - power.min()
        if abs(diff) >= 4:
            logger.info(" - log-transformed the '%s' feature.", col.name)
            return np.log10(root) + power
        else:
            return col.astype(float)

    col = col.astype(float)

    # A simple heuristic to find p-value / E-value features:
    # Non-negative:
    if col.min() >= 0:
        # Make sure this isn't a binary column:
        if not np.array_equal(col.values, col.values.astype(bool)):
            # Only log if values span >4 orders of magnitude,
            # excluding values that are exactly zero:
            zero_idx = col == 0
            col_min = col[~zero_idx].min()
            if col.max() / col_min >= 10000:
                col[~zero_idx] = np.log10(col[~zero_idx])
                # Exact zeros are placed one unit below the smallest
                # transformed value so they stay the most extreme scores.
                col[zero_idx] = col[~zero_idx].min() - 1
                logger.info(" - log-transformed the '%s' feature.", col.name)

    return col
-[docs]
-classPsmContainer:
-"""
- A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.
-
- Parameters
- ----------
- psms : pd.DataFrame
- DataFrame containing the PSM data.
- label_column : str
- Column containing the label (True for target, False for decoy).
- scan_column : str
- Column containing the scan number.
- spectrum_column : str
- Column containing the spectrum identifier.
- ms_data_file_column : str
- Column containing the MS data file that the PSM originated from.
- peptide_column : str
- Column containing the peptide sequence.
- protein_column : str
- Column containing the protein accessions.
- rescoring_features : dict of str to list of str
- Dictionary of feature columns for rescoring.
- hit_rank_column : str, optional
- Column containing the hit rank.
- charge_column : str, optional
- Column containing the charge state.
- retention_time_column : str, optional
- Column containing the retention time.
- metadata_column : str, optional
- Column containing metadata.
-
- Attributes
- ----------
- psms : pd.DataFrame
- Copy of the DataFrame containing the PSM data.
- target_psms : pd.DataFrame
- DataFrame containing only target PSMs (label = True).
- decoy_psms : pd.DataFrame
- DataFrame containing only decoy PSMs (label = False).
- peptides : list of str
- List containing all peptides from the PSM data.
- columns : list of str
- List of column names in the PSM DataFrame.
- rescoring_features : dict of str to list of str
- Dictionary of rescoring feature columns in the PSM DataFrame.
- """
-
- def__init__(
- self,
- psms:pd.DataFrame,
- label_column:str,
- scan_column:str,
- spectrum_column:str,
- ms_data_file_column:str,
- peptide_column:str,
- protein_column:str,
- rescoring_features:Dict[str,List[str]],
- hit_rank_column:Optional[str]=None,
- charge_column:Optional[int]=None,
- retention_time_column:Optional[str]=None,
- metadata_column:Optional[str]=None,
- ):
- self._psms=psms.copy()
- self._psms.reset_index(drop=True,inplace=True)
- self.label_column=label_column
- self.scan_column=scan_column
- self.spectrum_column=spectrum_column
- self.ms_data_file_column=ms_data_file_column
- self.peptide_column=peptide_column
- self.protein_column=protein_column
- self.hit_rank_column=hit_rank_column
- self.retention_time_column=retention_time_column
- self.metadata_column=metadata_column
- self.rescoring_features=rescoring_features
- self.charge_column=charge_column
- # rescore result column
- self.rescore_result_column=None
-
- # check if the columns are in the dataframe
- defcheck_column(col):
- ifcolandcolnotinpsms.columns:
- raiseValueError(f"Column '{col}' not found in PSM data.")
-
- check_column(label_column)
- check_column(scan_column)
- check_column(spectrum_column)
- check_column(ms_data_file_column)
- check_column(peptide_column)
- check_column(protein_column)
- check_column(hit_rank_column)
- check_column(retention_time_column)
- check_column(charge_column)
-
- ifpsms[label_column].nunique()==1andpsms[label_column].iloc[0]==True:
- raiseValueError("All PSMs are labeled as target. No decoy PSMs found.")
- elifpsms[label_column].nunique()==1andpsms[label_column].iloc[0]==False:
- raiseValueError("All PSMs are labeled as decoy. No target PSMs found.")
-
- defcheck_metadata_column(col):
- # check the type is Dict[str, Dict[str, str]]
- ifcolandcolnotinpsms.columns:
- raiseValueError(f"Column '{col}' not found in PSM data.")
- ifnotall(isinstance(x,dict)forxinself._psms[col]):
- raiseValueError(f"Column '{col}' must contain dictionaries.")
-
- ifmetadata_column:
- check_metadata_column(metadata_column)
-
- defcheck_rescoring_features(features:Dict[str,List[str]]):
- forkey,colsinfeatures.items():
- forcolincols:
- ifcolnotinpsms.columns:
- raiseValueError(
- f"Column '{col}' not found in PSM data for feature '{key}'."
- )
-
- check_rescoring_features(rescoring_features)
-
- # check if the number of decoy psms is not 0
- iflen(self.decoy_psms)==0:
- logger.error("No decoy PSMs found. Please check the decoy prefix.")
- raiseValueError("No decoy PSMs found.")
-
- logger.info("PsmContainer initialized with %d PSM entries.",len(self._psms))
- ifself.ms_data_file_column:
- logger.info(
- "PSMs originated from %d MS data file(S).",
- len(self._psms[ms_data_file_column].unique()),
- )
- logger.info("target psms: %d",len(self.target_psms))
- logger.info("decoy psms: %d",len(self.decoy_psms))
- logger.info("unique peptides: %d",len(np.unique(self.peptides)))
- logger.info("rescoing features: %s",rescoring_features)
-
-
- @property
- defpsms(self)->pd.DataFrame:
-"""
- Get a copy of the PSM DataFrame to prevent external modification.
-
- Returns
- -------
- pd.DataFrame
- A copy of the PSM DataFrame.
- """
- # TODO: return in a specific column order
- returnself._psms.copy()
-
-
-[docs]
- def__len__(self)->int:
-"""
- Get the number of PSMs in the container.
-
- Returns
- -------
- int
- Number of PSMs.
- """
- returnlen(self._psms)
-
-
- @property
- deftarget_psms(self)->pd.DataFrame:
-"""
- Get a DataFrame containing only target PSMs.
-
- Returns
- -------
- pd.DataFrame
- DataFrame with only target PSMs (label = True).
- """
- returnself._psms[self._psms[self.label_column]==True].copy()
-
- @property
- defdecoy_psms(self)->pd.DataFrame:
-"""
- Get a DataFrame containing only decoy PSMs.
-
- Returns
- -------
- pd.DataFrame
- DataFrame with only decoy PSMs (label = False).
- """
- returnself._psms[self._psms[self.label_column]==False].copy()
-
- @property
- defcolumns(self)->List[str]:
-"""
- Get the column names of the PSM DataFrame.
-
- Returns
- -------
- list of str
- List of column names.
- """
- returnlist(self._psms.columns)
-
- @property
- deffeature_columns(self)->List[str]:
-"""
- Get a list of all feature columns in the PSM DataFrame.
-
- Returns
- -------
- list of str
- List of feature column names.
- """
- return[colforcolsinself.rescoring_features.values()forcolincols]
-
- @property
- deffeature_sources(self)->List[str]:
-"""
- Get a list of all feature sources in the PSM DataFrame.
-
- Returns
- -------
- list of str
- List of feature source names.
- """
- returnlist(self.rescoring_features.keys())
-
- @property
- defpeptides(self)->List[str]:
-"""
- Get the peptide sequences from the PSM data.
-
- Returns
- -------
- list of str
- List of peptide sequences.
- """
- returnself._psms[self.peptide_column].tolist()
-
- @property
- defms_data_files(self)->List[str]:
-"""
- Get the MS data files from the PSM data.
-
- Returns
- -------
- list of str
- List of MS data file names.
- """
- returnself._psms[self.ms_data_file_column].tolist()
-
- @property
- defscan_ids(self)->List[int]:
-"""
- Get the scan numbers from the PSM data.
-
- Returns
- -------
- list of int
- List of scan numbers.
- """
- returnself._psms[self.scan_column].tolist()
-
- @property
- defcharges(self)->List[int]:
-"""
- Get the charge states from the PSM data.
-
- Returns
- -------
- list of int
- List of charge states.
- """
- returnself._psms[self.charge_column].tolist()
-
- @property
- defmetadata(self)->pd.Series:
-"""
- Get the metadata from the PSM data.
-
- Returns
- -------
- pd.Series
- Series containing metadata for each PSM.
- """
- returnself._psms[self.metadata_column].copy()
-
- @property
- defspectrum_ids(self)->List[str]:
-"""
- Get the spectrum identifiers from the PSM data.
-
- Returns
- -------
- list of str
- List of spectrum identifiers.
- """
- returnself._psms[self.spectrum_column].tolist()
-
- @property
- defidentifier_columns(self)->List[str]:
-"""
- Get the columns that uniquely identify each PSM.
-
- Returns
- -------
- list of str
- List of identifier column names.
- """
- return[
- self.scan_column,
- self.spectrum_column,
- self.peptide_column,
- self.protein_column,
- ]
-
-
-[docs]
- defcopy(self)->"PsmContainer":
-"""
- Return a deep copy of the PsmContainer object.
-
- Returns
- -------
- PsmContainer
- A deep copy of the current PsmContainer.
- """
- importcopy
-
- returncopy.deepcopy(self)
-[docs]
- defdrop_features(self,features:List[str])->None:
-"""
- Drop specified features from the PSM DataFrame.
-
- Parameters
- ----------
- features : list of str
- List of feature column names to drop.
-
- Raises
- ------
- ValueError
- If any of the features do not exist in the DataFrame.
- """
- missing_features=[fforfinfeaturesiffnotinself._psms.columns]
- ifmissing_features:
- raiseValueError(f"Features not found in PSM data: {missing_features}")
-
- self._psms.drop(columns=features,inplace=True)
- # Create a list of sources to update
- sources_to_update=[]
- forsource,colsinself.rescoring_features.items():
- self.rescoring_features[source]=[
- colforcolincolsifcolnotinfeatures
- ]
- ifnotself.rescoring_features[source]:
- sources_to_update.append(source)
-
- logger.info(
- f"Sources to be removed: {sources_to_update}. Because all the features are removed."
- )
- # Remove sources with no features left
- forsourceinsources_to_update:
- delself.rescoring_features[source]
-
-
-
-[docs]
- defdrop_source(self,source:str)->None:
-"""
- Drop all features associated with a specific source from the PSM DataFrame.
-
- Parameters
- ----------
- source : str
- Name of the source to drop.
-
- Raises
- ------
- ValueError
- If the source does not exist in the rescoring features.
- """
- ifsourcenotinself.rescoring_features:
- raiseValueError(f"Source '{source}' not found in rescoring features.")
- self.drop_features(self.rescoring_features[source])
-
-
-
-[docs]
- defadd_metadata(
- self,
- metadata_df:pd.DataFrame,
- psms_key:Union[str,List[str]],
- metadata_key:Union[str,List[str]],
- source,
- )->None:
-"""
- Merge new metadata into the PSM DataFrame based on specified columns.
- Metadata from the specified source is stored as a nested dictionary inside the metadata column.
-
- Parameters
- ----------
- metadata_df : pd.DataFrame
- DataFrame containing new metadata to add.
- psms_key : str or list of str
- Column name(s) in the PSM data to merge on.
- metadata_key : str or list of str
- Column name(s) in the metadata data to merge on.
- source : str
- Name of the source of the new metadata.
- """
- ifself.metadata_columnisNone:
- logger.info("No existing metadata column. Creating new metadata column.")
- self.metadata_column="metadata"
- self._psms["metadata"]=[{}for_inrange(len(self._psms))]
-
- metadata_cols=[colforcolinmetadata_df.columnsifcolnotinmetadata_key]
- merged_df=self.psms.merge(
- metadata_df,left_on=psms_key,right_on=metadata_key,how="left"
- )
- ifsourceinself._psms["metadata"]:
- logger.warning(f"{source} already exists in metadata. Overwriting.")
- forcolinmetadata_cols:
- merged_df["metadata"]=merged_df.apply(
- lambdarow:{
- **row["metadata"],
- source:(
- {col:row[col]}
- ifsourcenotinrow["metadata"]
- else{**row["metadata"][source],col:row[col]}
- ),
- },
- axis=1,
- )
-
- self._psms["metadata"]=merged_df["metadata"]
-
-
-
-[docs]
- defget_top_hits(self,n:int=1):
-"""
- Get the top n hits based on the hit rank column.
- If the hit rank column is not specified, returns the original PSMs.
-
- Parameters
- ----------
- n : int, optional
- The number of top hits to return. Default is 1.
-
- Returns
- -------
- PsmContainer
- A new PsmContainer object containing the top n hits.
- """
- ifself.hit_rank_columnisNone:
- logger.warning("Rank column not specified. Return the original PSMs.")
- returnself.copy()
-
- psms=self.copy()
- psms._psms=psms._psms[psms._psms[self.hit_rank_column]<=n]
- returnpsms
-
-
-
-[docs]
- defadd_features(
- self,
- features_df:pd.DataFrame,
- psms_key:Union[str,List[str]],
- feature_key:Union[str,List[str]],
- source:str,
- suffix:Optional[str]=None,
- )->None:
-"""Merge new features into the PSM DataFrame based on specified columns.
-
- This method performs a left join between the PSM data and feature data,
- ensuring that all PSMs are preserved while adding new features. It handles
- column name conflicts through optional suffixing and maintains feature source
- tracking.
-
- Parameters
- ----------
- features_df : pd.DataFrame
- DataFrame containing new features to add.
- psms_key : str or list of str
- Column name(s) in the PSM data to merge on.
- feature_key : str or list of str
- Column name(s) in the features data to merge on.
- source : str
- Name of the source of the new features (e.g., 'deeplc', 'netmhc').
- suffix : str, optional
- Suffix to add to the new columns if there's a name conflict.
- Required when new feature columns have the same names as existing columns.
- For example, if adding features from different sources (e.g., 'score' from
- DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.
-
- Returns
- -------
- None
-
- Raises
- ------
- ValueError
- If duplicate columns exist without suffix.
- If merging features changes the number of PSMs.
-
- Notes
- -----
- The method follows these steps:
- 1. Validates input and prepares merge keys
- 2. Checks for column name conflicts
- 3. Manages feature source: if the source already exists, it will be overwritten
- 4. Performs left join merge
- 5. Verifies data integrity
-
- Suffix Usage
- -----------
- The suffix parameter is used to handle column name conflicts:
- - When adding features from different sources that might have the same column names
- - When you want to keep both the original and new features with the same name
- - When you need to track the source of features in the column names
-
- If suffix is not provided and there are duplicate column names:
- - The method will raise a ValueError
- - You must either provide a suffix or rename the columns before adding
-
- Examples
- --------
- >>> container = PsmContainer(...)
- >>> # Adding features without suffix (no conflicts)
- >>> features_df1 = pd.DataFrame({
- ... 'scan': [1, 2, 3],
- ... 'feature1': [0.1, 0.2, 0.3],
- ... 'feature2': [0.4, 0.5, 0.6]
- ... })
- >>> container.add_features(
- ... features_df1,
- ... psms_key='scan',
- ... feature_key='scan',
- ... source='source1'
- ... )
- >>> # Adding features with suffix (handling conflicts)
- >>> features_df2 = pd.DataFrame({
- ... 'scan': [1, 2, 3],
- ... 'score': [0.8, 0.9, 0.7], # This would conflict with existing 'score'
- ... 'feature3': [0.7, 0.8, 0.9]
- ... })
- >>> container.add_features(
- ... features_df2,
- ... psms_key='scan',
- ... feature_key='scan',
- ... source='source2',
- ... suffix='_new' # 'score' becomes 'score_new'
- ... )
- """
- ifisinstance(psms_key,str):
- psms_key=[psms_key]
-
- ifisinstance(feature_key,str):
- feature_key=[feature_key]
-
- new_feature_cols=[
- colforcolinfeatures_df.columnsifcolnotinfeature_key
- ]
-
- forcolsinnew_feature_cols:
- ifcolsinself._psms.columns:
- logger.warning(f"Column '{cols}' already exists in PSM data.")
- ifsuffixisNone:
- logger.warning("No suffix provided. Using default suffix ")
- raiseValueError("Duplicate columns exist. No suffix provided.")
- else:
- logger.warning(
- f"Suffix '{suffix}' provided. Using suffix '{suffix}'."
- )
- logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")
-
- ifnotnew_feature_cols:
- logger.warning("No new features to add. Check the feature key and PSMs key.")
- logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")
-
- ifsourceinself.rescoring_features:
- logger.warning(
- f"{source} already exists in rescoring features. Overwriting."
- )
- self.drop_source(source)
-
- # TODO: reluctant logic
- ifsuffixisNone:
- suffixes=("","")
- else:
- suffixes=("",suffix)
-
- self.rescoring_features[source]=[
- col+suffixes[1]forcolinnew_feature_cols
- ]
- features_df=features_df.rename(
- columns={col:col+suffixes[1]forcolinnew_feature_cols}
- )
- original_len=len(self._psms)
- # avoid merge the right key to the psms
- self._psms=self._psms.merge(
- features_df,left_on=psms_key,right_on=feature_key,how="left"
- )
-
- iffeature_key!=psms_key:
- cols_to_drop=[
- col
- forcolinfeature_key
- ifcolnotinpsms_keyandcolinself._psms.columns
- ]
- ifcols_to_drop:
- logger.debug(
- f"Dropping columns from feature_key not in psms_key: {cols_to_drop}"
- )
- self._psms.drop(columns=cols_to_drop,inplace=True)
-
- iflen(self._psms)!=original_len:
- raiseValueError(
- "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
- )
-
-
-
-[docs]
- defadd_features_by_index(
- self,features_df:pd.DataFrame,source:str,suffix:Optional[str]=None
- )->None:
-"""
- Merge new features into the PSM DataFrame based on the DataFrame index.
-
- Parameters
- ----------
- features_df : pd.DataFrame
- DataFrame containing new features to add.
- source : str
- Name of the source of the new features.
- suffix : str, optional
- Suffix to add to the new columns if there's a name conflict.
- """
- new_feature_cols=[colforcolinfeatures_df.columns]
- forcolinnew_feature_cols:
- ifcolinself._psms.columns:
- logger.warning(f"Column '{col}' already exists in PSM data.")
- ifsuffixisNone:
- logger.warning("No suffix provided. Using default suffix.")
- raiseValueError("Duplicate columns exist. No suffix provided.")
- else:
- logger.warning(
- f"Suffix '{suffix}' provided. Using suffix '{suffix}'."
- )
-
- logger.info(
- f"Adding {len(new_feature_cols)} new features from {source} by index."
- )
-
- ifnotnew_feature_cols:
- logger.warning("No new features to add.")
- raiseValueError("No new features to add.")
-
- ifsourceinself.rescoring_features:
- logger.warning(
- f"{source} already exists in rescoring features. Overwriting."
- )
- self.drop_source(source)
-
- ifsuffixisNone:
- suffixes=("","")
- else:
- suffixes=("",suffix)
-
- self.rescoring_features[source]=[
- col+suffixes[1]forcolinnew_feature_cols
- ]
- features_df.rename(
- columns={col:col+suffixes[1]forcolinnew_feature_cols},inplace=True
- )
- original_len=len(self._psms)
- self._psms=self._psms.merge(
- features_df,
- left_index=True,
- right_index=True,
- how="left",# Perform a left join to preserve all original PSM data
- )
-
- # Ensure that the merge did not change the number of rows in the PSM DataFrame
- iflen(self._psms)!=original_len:
- raiseValueError(
- "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
- )
-
-
-
-[docs]
    def add_results(
        self,
        results_df: pd.DataFrame,
        psms_key: Union[str, List[str]],
        result_key: Union[str, List[str]],
    ) -> None:
        """
        Add results of rescore engine to the PSM DataFrame based on specified columns.

        Parameters
        ----------
        results_df : pd.DataFrame
            DataFrame containing new results to add.
        psms_key : str or list of str
            Column name(s) in the PSM data to merge on.
        result_key : str or list of str
            Column name(s) in the results data to merge on.

        Raises
        ------
        ValueError
            If any column of `results_df` collides with an existing PSM column.
        pandas.errors.MergeError
            If the merge is not one-to-one (via ``validate="one_to_one"``).

        Notes
        -----
        After the merge, the `result_key` column(s) carried over from
        `results_df` are dropped from the PSM data.
        NOTE(review): if `psms_key` and `result_key` name the same column,
        the final ``drop`` removes the shared key column itself, and the
        stored ``rescore_result_column`` then refers to a dropped column —
        confirm callers always pass distinct key names.
        """
        if self.rescore_result_column is not None:
            logger.warning("Rescore result column already exists. Overwriting.")

        # Refuse to merge when any result column already exists in the PSM data;
        # the caller must rename the conflicting columns first.
        if set(self._psms.columns) & set(results_df.columns):
            raise ValueError(
                "Duplicate columns exist. Please rename the columns in the results data."
            )

        self.rescore_result_column = result_key
        # validate="one_to_one" makes pandas raise if either side carries
        # duplicate merge keys, so the row count cannot silently change.
        self._psms = self._psms.merge(
            results_df,
            left_on=psms_key,
            right_on=result_key,
            how="left",
            validate="one_to_one",
        )
        # Drop the right-hand merge key column(s) copied in from results_df.
        self._psms.drop(columns=result_key, inplace=True)
        logger.info(f"Added rescore results to PSM data.")
-
-
-
-[docs]
- defwrite_pin(self,output_file:str,source:List[str]=None)->None:
-"""
- Write the PSM data to a Percolator input (PIN) file.
-
- Percolator accepts input in a simple tab-delimited format where each row contains features associated with a single PSM:
- id <tab> label <tab> scannr <tab> feature1 <tab> ... <tab> featureN <tab> peptide <tab> proteinId1 <tab> .. <tab> proteinIdM
- With header:
- SpecID <tab> Label <tab> ScanNr <tab> Feature1 <tab> ... <tab> FeatureN <tab> Peptide <tab> Proteins
-
- Parameters
- ----------
- output_file : str
- The path to the output PIN file.
- source : list of str, optional
- List of feature sources to include. If None, include all sources.
-
- Returns
- -------
- pd.DataFrame
- The DataFrame written to the PIN file.
- """
- df=self._psms.copy()
- # Check if the label column is str
- # Case1: label column is str
- ifdf[self.label_column].dtype=="str":
- df["PercolatorLabel"]=df[self.label_column].map({"True":1,"False":-1})
- # Case2: label column is bool
- elifdf[self.label_column].dtype=="bool":
- df["PercolatorLabel"]=df[self.label_column].map({True:1,False:-1})
- else:
- # try to convert to bool
- logger.warning("Label column is not str or bool. Converting to bool.")
- df["PercolatorLabel"]=(
- df[self.label_column].astype(bool).map({True:1,False:-1})
- )
- logger.info("Writing PIN file to %s",output_file)
-
- feature_cols=[]
- ifsourceisNone:
- for_,colsinself.rescoring_features.items():
- feature_cols.extend(cols)
- else:
- forsinsource:
- ifsnotinself.rescoring_features:
- raiseValueError(f"Source '{s}' not found in rescoring features.")
- feature_cols.extend(self.rescoring_features[s])
-
- pin_df=pd.DataFrame()
- pin_df["SpecID"]=df[self.spectrum_column]
- pin_df["Label"]=df["PercolatorLabel"]
- pin_df["ScanNr"]=df[self.scan_column]
- forcolinfeature_cols:
- pin_df[col]=df[col]
-
- pin_df["Peptide"]=df[self.peptide_column]
- pin_df["Proteins"]=df[self.protein_column].apply(
- lambdax:";".join(x)ifisinstance(x,(list,tuple))elsex
- )
- pin_df.to_csv(output_file,sep="\t",index=False)
- logger.info("PIN file written to %s",output_file)
-
- returnpin_df
-[docs]
def convert_pfm_to_pwm(pfm_filename, pseudocount=0.8, background_freqs=None):
    """
    Convert a Position Frequency Matrix (PFM) file to a Position Weight Matrix (PWM).

    Parameters
    ----------
    pfm_filename : str
        The file path to the PFM file (tab-separated, amino acids as row index;
        the last column is dropped as extraneous).
    pseudocount : float, optional
        The pseudocount to add to the PFM to avoid zero probabilities. Default is 0.8.
    background_freqs : dict, optional
        Dictionary containing the background frequencies for each amino acid.
        If None, a uniform background of 1/20 is used.

    Returns
    -------
    pd.DataFrame
        DataFrame representation of the PWM.

    Notes
    -----
    The conversion process involves:
    1. Adding pseudocounts to the PFM
    2. Converting to Position Probability Matrix (PPM)
    3. Converting to PWM using log2(PPM/background_freqs)
    """
    pfm = pd.read_csv(pfm_filename, sep="\t", header=None, index_col=0)
    pfm.drop(pfm.columns[-1], axis=1, inplace=True)  # Drop any extraneous columns
    pfm_pseudo = pfm + pseudocount
    # Normalize each position (column) to probabilities.
    ppm = pfm_pseudo.div(pfm_pseudo.sum(axis=0), axis=1)

    if background_freqs is None:
        # BUG FIX: the old code assigned the scalar 1/20 to background_freqs and
        # then called .values() on it, raising AttributeError. A uniform
        # background is simply a scalar division.
        pwm = np.log2(ppm / (1 / 20))
    else:
        pwm = np.log2(ppm.div(list(background_freqs.values())))

    return pwm
-
-
-
-
-[docs]
def remove_pre_and_nxt_aa(peptide: str) -> str:
    """
    Strip the flanking amino acids from a peptide sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence possibly carrying flanking residues around dots.
        Example: '.AANDAGYFNDEMAPIEVKTK.'

    Returns
    -------
    str
        The bare peptide sequence, e.g. 'AANDAGYFNDEMAPIEVKTK'.

    Notes
    -----
    Everything up to (and including) the first '.' and everything from the
    last '.' onward is removed; sequences without dots are returned unchanged.
    """
    import re

    # Drop the leading flank ("X.") first, then the trailing flank (".X").
    without_prefix = re.sub(r"^[^.]*\.", "", peptide)
    return re.sub(r"\.[^.]*$", "", without_prefix)
-
-
-
-
-[docs]
def remove_modifications(peptide: str, keep_modification=None) -> str:
    """
    Strip bracketed modifications from a peptide sequence, optionally keeping some.

    Parameters
    ----------
    peptide : str
        Peptide sequence with modifications in square brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    keep_modification : str or list, optional
        Modification(s) to preserve. Any bracketed group containing one of
        these substrings is left in place. Default is None (remove all).

    Returns
    -------
    str
        The peptide with modifications removed (or selectively kept).
        Example: 'AANDAGYFNDEMAPIEVKTK' (keep_modification is None)
        Example: 'AANDAGYFNDEM[15.9949]APIEVKTK' (keep_modification=['15.9949'])
    """
    import re

    mod_pattern = r"\[.*?\]"

    if keep_modification is None:
        # Nothing to keep: drop every bracketed group.
        return re.sub(mod_pattern, "", peptide)

    kept = (
        keep_modification
        if isinstance(keep_modification, list)
        else [keep_modification]
    )

    # Keep a bracketed group only when it contains one of the requested mods.
    return re.sub(
        mod_pattern,
        lambda m: m.group(0) if any(k in m.group(0) for k in kept) else "",
        peptide,
    )
-
-
-
-
-[docs]
def preprocess_peptide(peptide: str) -> str:
    """
    Normalize a peptide string to its bare amino-acid sequence.

    Parameters
    ----------
    peptide : str
        Peptide possibly carrying flanking residues and bracketed modifications.
        Example: '.AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK.'

    Returns
    -------
    str
        Sequence with flanking residues and modifications stripped.
        Example: 'AANDAGYFNDEMAPIEVKTK'

    Notes
    -----
    Composes the two cleanup helpers: flanks are removed first with
    remove_pre_and_nxt_aa, then all modifications with remove_modifications.
    """
    return remove_modifications(remove_pre_and_nxt_aa(peptide))
-
-
-
-
-[docs]
def list_all_files_in_directory(directory_path: str) -> List[str]:
    """
    Recursively collect every file under a directory.

    Parameters
    ----------
    directory_path : str
        The directory to search, e.g. '/path/to/directory'.

    Returns
    -------
    list of str
        Paths (as strings) of all files found in the directory and all of its
        subdirectories.
    """
    collected = []
    # rglob("*") walks the whole tree; keep only regular files.
    for entry in Path(directory_path).rglob("*"):
        if entry.is_file():
            collected.append(str(entry))
    return collected
-
-
-
-
-[docs]
def extract_unimod_from_peptidoform(peptide: str, mod_dict: dict) -> tuple:
    """
    Convert a modified peptide sequence into DeepLC format.

    Parameters
    ----------
    peptide : str
        The input peptide sequence with modifications in brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    mod_dict : dict
        Dictionary mapping modification names (in peptide) to corresponding Unimod names.
        Example: {'15.9949': 'Oxidation', '42.0106': 'Acetyl'}

    Returns
    -------
    tuple
        (seq, modifications):
        seq : str
            The unmodified peptide sequence.
        modifications : str
            String of modifications formatted as `position|UnimodName`,
            separated by pipes `|`.

    Raises
    ------
    ValueError
        If a modification bracket is unclosed, or a modification name is not
        present in `mod_dict`.
    """
    residues = []      # plain amino-acid characters, in order
    mod_entries = []   # "position|UnimodName" strings
    position = 0       # count of residues consumed so far
    cursor = 0
    length = len(peptide)

    while cursor < length:
        if peptide[cursor] == "[":
            closing = peptide.find("]", cursor)
            if closing == -1:
                raise ValueError(
                    f"Invalid modification format in {peptide}: missing closing bracket."
                )
            mod_name = peptide[cursor + 1 : closing]
            if mod_name not in mod_dict:
                raise ValueError(
                    f"Modification '{mod_name}' not found in the dictionary."
                )
            # The modification is attached to the residue just before the bracket.
            mod_entries.append(f"{position}|{mod_dict[mod_name]}")
            cursor = closing + 1
        else:
            residues.append(peptide[cursor])
            position += 1
            cursor += 1

    clean_sequence = "".join(residues)
    modification_str = "|".join(mod_entries)
    logger.debug(f"Original peptide: {peptide}")
    logger.debug(f"Output clean_sequence: {clean_sequence}")
    logger.debug(f"Output modifications: {modification_str}")
    return clean_sequence, modification_str
-
-
-
-
-[docs]
def convert_to_unimod_format(peptide: str, mod_dict: dict) -> str:
    """
    Convert a modified peptide sequence into Unimod format.

    Parameters
    ----------
    peptide : str
        The input peptide sequence with modifications in brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    mod_dict : dict
        Dictionary mapping modification names (in peptide) to corresponding Unimod names.
        Example: {'15.9949': 'UNIMOD:4', '42.0106': 'UNIMOD:1'}

    Returns
    -------
    str
        The peptide sequence formatted for Unimod.
        Example: 'AANDAGYFNDEM[UNIMOD:4]APIEVK[UNIMOD:1]TK'

    Notes
    -----
    Each bracketed modification name is replaced by its Unimod identifier;
    the amino-acid sequence itself is left untouched.
    """
    result = peptide
    for original_name, unimod_name in mod_dict.items():
        result = result.replace("[" + original_name + "]", "[" + unimod_name + "]")
    logger.debug(f"Original peptide: {peptide}")
    logger.debug(f"Output peptide: {result}")
    return result
-[docs]
def plot_feature_importance(
    models, rescoring_features, save_path=None, sort=False, error=False, **kwargs
):
    """
    Unified function to plot average feature importance across multiple models.

    This function supports:
    - Linear models (e.g., Linear SVR) which provide an 'estimator' attribute with a 'coef_'.
      The absolute value of the coefficients is used for importance, and hatch patterns are applied
      to differentiate between positive and negative coefficients.
    - XGBoost models which provide a 'feature_importances_' attribute. Since these values are
      always positive, no hatch patterns are applied.

    Parameters
    ----------
    models : list
        A list of model objects.
        For linear models, each model should have an 'estimator' with 'coef_'.
        For XGBoost models, each model should have a 'feature_importances_' attribute.
    rescoring_features : dict
        A dictionary where keys are sources and values are lists of features.
    save_path : str, optional
        If provided, saves the plot to the specified path.
    sort : bool, optional
        If True, sorts the features by their importance in descending order.
        Default is False.
    error : bool, optional
        If True, adds error bars to the plot. Default is False.
    **kwargs : dict
        Additional plotting parameters such as 'figsize' and 'dpi'.

    Raises
    ------
    ValueError
        If the first model exposes neither 'estimator.coef_' nor
        'estimator.feature_importances_'.

    Notes
    -----
    The function automatically detects the model type based on the presence of the corresponding attribute.
    For linear models, it uses hatch patterns to differentiate between positive and negative coefficients.
    For XGBoost models, it uses solid bars since the importances are always positive.
    """
    # Determine the model type based on the first model in the list.
    # (All models in the list are assumed to be of the same type.)
    if hasattr(models[0].estimator, "coef_"):
        model_type = "linear"
    elif hasattr(models[0].estimator, "feature_importances_"):
        model_type = "xgb"
    else:
        raise ValueError(
            "Model type not recognized. Model must have 'estimator.coef_' for linear models or "
            "'estimator.feature_importances_' for XGBoost models."
        )

    if model_type == "linear":
        feature_importances = []
        for model in models:
            coefficients = model.estimator.coef_
            # Importance per feature = mean absolute coefficient across outputs.
            feature_importances.append(np.abs(coefficients).mean(axis=0))
            logger.debug(f"Model coefficients shape: {coefficients.shape}")

        # Average (and spread) of the importances across all models.
        average_feature_importance = np.mean(feature_importances, axis=0)
        std_feature_importance = np.std(feature_importances, axis=0)
        # Mean signed coefficient per feature; used below to hatch bars of
        # features whose average contribution is negative.
        feature_signs = np.mean(
            [model.estimator.coef_.mean(axis=0) for model in models], axis=0
        )

    elif model_type == "xgb":
        feature_importances = []
        for model in models:
            # Use the XGBoost feature importances directly as they are always positive
            imp = model.estimator.feature_importances_
            feature_importances.append(imp)
            logger.debug(f"Model feature importances shape: {imp.shape}")

        average_feature_importance = np.mean(feature_importances, axis=0)
        std_feature_importance = np.std(feature_importances, axis=0)
        # All importances are non-negative, so treat every sign as positive.
        feature_signs = np.ones_like(average_feature_importance)

    logger.debug(
        f"Total rescoring features: {len(sum(rescoring_features.values(), []))}"
    )
    logger.debug(
        f"Average feature importance length: {len(average_feature_importance)}"
    )
    logger.debug(f"Features: {sum(rescoring_features.values(), [])}")

    figsize = kwargs.get("figsize", (15, 10))
    dpi = kwargs.get("dpi", 300)

    all_features = []
    all_importances = []
    all_errors = []
    all_colors = []
    all_hatches = []  # Hatch patterns will be applied only for linear models.

    color_cycle = cycle(plt.cm.tab10.colors)

    for source, features in rescoring_features.items():
        color = next(color_cycle)
        # Positions of this source's features within the flattened feature
        # order. Assumes the model was trained on the features in this
        # concatenated order — TODO confirm against the training code.
        indices = [
            i
            for i, name in enumerate(sum(rescoring_features.values(), []))
            if name in features
        ]
        source_importances = average_feature_importance[indices]
        source_std = std_feature_importance[indices]

        if model_type == "linear":
            source_signs = feature_signs[indices]

        if sort:
            # Descending order of importance within this source.
            sorted_indices = np.argsort(-source_importances)
        else:
            sorted_indices = np.arange(len(features))

        sorted_features = [features[i] for i in sorted_indices]
        sorted_importances = source_importances[sorted_indices]
        sorted_std = source_std[sorted_indices]

        all_features.extend(sorted_features)
        all_importances.extend(sorted_importances)
        all_errors.extend(sorted_std)
        all_colors.extend([color] * len(sorted_features))

        if model_type == "linear":
            # For linear models, use hatch patterns to differentiate positive and negative coefficients.
            # An empty hatch ('') for positive and '\\' for negative coefficients.
            sorted_signs = source_signs[sorted_indices]
            all_hatches.extend(["" if sign >= 0 else "\\\\" for sign in sorted_signs])
        else:
            all_hatches.extend([""] * len(sorted_features))

    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    if error:
        bars = ax.barh(
            all_features, all_importances, xerr=all_errors, color=all_colors, capsize=5
        )
    else:
        bars = ax.barh(all_features, all_importances, color=all_colors)

    if model_type == "linear":
        for bar, hatch in zip(bars, all_hatches):
            bar.set_hatch(hatch)
        # Two legends in one: hatch meaning (sign) and color meaning (source).
        legend_hatches = [
            Patch(facecolor="white", edgecolor="black", hatch="", label="Positive"),
            Patch(facecolor="white", edgecolor="black", hatch="\\\\", label="Negative"),
        ]
        legend_colors = [
            Patch(facecolor=color, edgecolor="black", label=source)
            for color, source in zip(plt.cm.tab10.colors, rescoring_features.keys())
        ]
        ax.legend(handles=legend_hatches + legend_colors, loc="best")
    else:
        legend_colors = [
            Patch(facecolor=color, edgecolor="black", label=source)
            for color, source in zip(plt.cm.tab10.colors, rescoring_features.keys())
        ]
        ax.legend(handles=legend_colors, loc="best")

    ax.set_xlabel("Average Feature Importance")
    ax.set_ylabel("Feature")

    save_or_show_plot(save_path, logger)
-
-
-
-
-[docs]
def visualize_feature_correlation(psms: PsmContainer, save_path=None, **kwargs):
    """
    Draw a heatmap of the pairwise correlations between rescoring features.

    Parameters
    ----------
    psms : PsmContainer
        Container whose rescoring features are correlated and plotted.
    save_path : str, optional
        Where to save the figure; if omitted, the figure is shown instead.
    **kwargs : dict
        Extra plotting options such as `figsize` and `dpi`.

    Notes
    -----
    All features from every source in `psms.rescoring_features` are flattened
    into a single list, their correlation matrix is computed, and the matrix
    is rendered with a coolwarm colormap.
    """
    figsize = kwargs.get("figsize", (40, 36))
    dpi = kwargs.get("dpi", 300)

    # Flatten the per-source feature lists into one list of column names.
    feature_names = []
    for cols in psms.rescoring_features.values():
        feature_names.extend(cols)

    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    correlation_matrix = psms.psms[feature_names].corr()
    sns.heatmap(correlation_matrix, cmap="coolwarm", ax=ax)
    ax.set_title("Feature Correlation Heatmap")

    save_or_show_plot(save_path, logger)
-[docs]
def plot_qvalues(
    results,
    save_path=None,
    dpi=300,
    figsize=(15, 10),
    threshold=0.05,
    colors=None,
    **kwargs,
):
    """
    Plot q-values for the given results.

    Parameters
    ----------
    results : object or list
        A single result object, or a list of them; each must provide a
        `plot_qvalues` method.
    save_path : str, optional
        If provided, saves the plot to the specified path.
    dpi : int, optional
        The resolution of the plot. Default is 300.
    figsize : tuple, optional
        The size of the figure. Default is (15, 10).
    threshold : float, optional
        The q-value threshold for plotting. Default is 0.05.
    colors : list, optional
        Colors for the curves; defaults to the matplotlib tab10 palette.
    **kwargs : dict
        Additional plotting parameters forwarded to each `plot_qvalues` call.

    Returns
    -------
    None
        The figure is saved or displayed via `save_or_show_plot`.

    Notes
    -----
    One subplot is drawn per level ("psms" and "peptides"); each result is
    plotted in its own color, cycling if there are more results than colors.
    """
    # Normalize the input so a single result is handled like a list of one.
    result_list = results if isinstance(results, list) else [results]

    if colors is None:
        # Default tab10 hex palette.
        colors = [
            "#1f77b4",
            "#ff7f0e",
            "#2ca02c",
            "#d62728",
            "#9467bd",
            "#8c564b",
            "#e377c2",
            "#7f7f7f",
            "#bcbd22",
            "#17becf",
        ]

    fig, axs = plt.subplots(1, 2, figsize=figsize, dpi=dpi)

    multiple = len(result_list) > 1
    for idx, res in enumerate(result_list):
        curve_color = colors[idx % len(colors)]
        curve_label = f"Result {idx+1}" if multiple else "Results"
        for axis, level in zip(axs, ["psms", "peptides"]):
            res.plot_qvalues(
                level=level,
                c=curve_color,
                ax=axis,
                threshold=threshold,
                label=curve_label,
                linewidth=1,
                **kwargs,
            )
            axis.legend(frameon=False)
            axis.set_title(f"{level}")

    plt.tight_layout()
    return save_or_show_plot(save_path, logger)
-[docs]
def visualize_target_decoy_features(
    psms: PsmContainer, num_cols=5, save_path=None, **kwargs
):
    """
    Visualize the distribution of features in a DataFrame using kernel density estimation plots.

    Parameters
    ----------
    psms : PsmContainer
        A PsmContainer object containing the features to visualize.
    num_cols : int, optional
        The number of columns in the plot grid. Default is 5.
    save_path : str, optional
        The file path to save the plot. If not provided, the plot is displayed.
    **kwargs : dict
        Additional plotting parameters such as `figsize` and `dpi`, etc.

    Notes
    -----
    This function:
    1. Extracts rescoring features from the PsmContainer
    2. Filters out features with only one unique value
    3. Creates a grid of plots showing the distribution of each feature
    4. Separates target and decoy PSMs in each plot
    5. Uses kernel density estimation to show the distribution shape
    """
    # Collect all rescoring features, excluding the hit-rank column itself.
    rescoring_features = [
        item
        for sublist in psms.rescoring_features.values()
        for item in sublist
        if item != psms.hit_rank_column
    ]

    # drop features that only have one value (nothing to plot)
    rescoring_features = [
        feature
        for feature in rescoring_features
        if len(psms.psms[feature].unique()) > 1
    ]

    num_features = len(rescoring_features)
    if num_features == 0:
        # BUG FIX: with zero features the old code created an empty grid and
        # then hit a NameError on `i` in the axis-cleanup loop below.
        logger.warning("No rescoring features with more than one unique value to plot.")
        return

    num_rows = (num_features + num_cols - 1) // num_cols

    figsize = kwargs.get("figsize", (15, num_rows * 15 / num_cols))
    dpi = kwargs.get("dpi", 300)

    # BUG FIX: squeeze=False guarantees a 2-D array of Axes even for a 1x1
    # grid, so flatten() is always valid (a bare Axes has no flatten()).
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=figsize, dpi=dpi, squeeze=False
    )
    axes = axes.flatten()

    # Restrict to rank-1 hits and relabel the boolean column for the legend.
    psms_top_hits = psms.psms[psms.psms[psms.hit_rank_column] == 1].copy()
    num_true_hits = len(psms_top_hits[psms_top_hits[psms.label_column] == True])
    num_decoys = len(psms_top_hits[psms_top_hits[psms.label_column] == False])
    logger.debug(f"Number of true hits: {num_true_hits}")
    logger.debug(f"Number of decoys: {num_decoys}")
    psms_top_hits[psms.label_column] = psms_top_hits[psms.label_column].map(
        {True: "Target", False: "Decoy"}
    )

    for i, feature in enumerate(rescoring_features):
        try:
            ax = axes[i]
            sns.histplot(
                data=psms_top_hits,
                x=feature,
                hue=psms.label_column,
                ax=ax,
                bins="auto",
                common_bins=True,
                multiple="dodge",
                fill=True,
                alpha=0.3,
                stat="frequency",
                kde=True,
                linewidth=0,
            )
            ax.set_title(feature)
            ax.set_xlabel("")
            ax.set_ylabel("")

        except Exception as e:
            # Skip features seaborn cannot plot, but keep going with the rest.
            logger.error(f"Error plotting feature {feature}: {e}")
            ax.set_visible(False)

    # Remove the unused axes at the end of the grid.
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    save_or_show_plot(save_path, logger)
Main pipeline class for optiMHC, encapsulating the full data processing workflow.

This class orchestrates input parsing, feature generation, rescoring, result saving, and visualization.
It supports both single-run and experiment modes (multiple feature/model combinations).

Parameters:

config (str, dict, or Config) – Path to YAML config, dict, or Config object.
Run experiments with different feature/model combinations using multiprocessing.

Each experiment is executed in its own process for complete resource isolation.
The experiment configurations must be provided in the config under the 'experiments' key.
Create or update all loggers so that each logger has a StreamHandler and, optionally, a FileHandler.
This ensures all log messages are displayed in the console and optionally saved to a file.

Parameters:

log_file (str, optional) – Path to the log file. If None, no file logging is set up.

Print debugging information for all loggers whose names start with 'optimhc' and
for the root logger. This helps verify that logger configurations are set properly.
Merge new metadata into the PSM DataFrame based on specified columns.
-Metadata from the specified source is stored as a nested dictionary inside the metadata column.
-
-
Parameters:
-
-
metadata_df (pd.DataFrame) – DataFrame containing new metadata to add.
-
psms_key (str or list of str) – Column name(s) in the PSM data to merge on.
-
metadata_key (str or list of str) – Column name(s) in the metadata data to merge on.
-
source (str) – Name of the source of the new metadata.
Merge new features into the PSM DataFrame based on specified columns.
-
This method performs a left join between the PSM data and feature data,
-ensuring that all PSMs are preserved while adding new features. It handles
-column name conflicts through optional suffixing and maintains feature source
-tracking.
-
-
Parameters:
-
-
features_df (pd.DataFrame) – DataFrame containing new features to add.
-
psms_key (str or list of str) – Column name(s) in the PSM data to merge on.
-
feature_key (str or list of str) – Column name(s) in the features data to merge on.
-
source (str) – Name of the source of the new features (e.g., ‘deeplc’, ‘netmhc’).
-
suffix (str, optional) – Suffix to add to the new columns if there’s a name conflict.
-Required when new feature columns have the same names as existing columns.
-For example, if adding features from different sources (e.g., ‘score’ from
-DeepLC and NetMHC), use suffixes like ‘_deeplc’ or ‘_netmhc’ to distinguish them.
-
-
-
Return type:
-
None
-
-
Raises:
-
ValueError – If duplicate columns exist without suffix.
- If merging features changes the number of PSMs.
-
-
-
-
Notes
-
The method follows these steps:
-1. Validates input and prepares merge keys
-2. Checks for column name conflicts
-3. Manages feature source: if the source already exists, it will be overwritten
-4. Performs left join merge
-5. Verifies data integrity
-
The suffix parameter is used to handle column name conflicts:
-- When adding features from different sources that might have the same column names
-- When you want to keep both the original and new features with the same name
-- When you need to track the source of features in the column names
-
If suffix is not provided and there are duplicate column names:
-- The method will raise a ValueError
-- You must either provide a suffix or rename the columns before adding
-
-
-
Examples
-
>>> container=PsmContainer(...)
->>> # Adding features without suffix (no conflicts)
->>> features_df1=pd.DataFrame({
-... 'scan':[1,2,3],
-... 'feature1':[0.1,0.2,0.3],
-... 'feature2':[0.4,0.5,0.6]
-... })
->>> container.add_features(
-... features_df1,
-... psms_key='scan',
-... feature_key='scan',
-... source='source1'
-... )
->>> # Adding features with suffix (handling conflicts)
->>> features_df2=pd.DataFrame({
-... 'scan':[1,2,3],
-... 'score':[0.8,0.9,0.7],# This would conflict with existing 'score'
-... 'feature3':[0.7,0.8,0.9]
-... })
->>> container.add_features(
-... features_df2,
-... psms_key='scan',
-... feature_key='scan',
-... source='source2',
-... suffix='_new'# 'score' becomes 'score_new'
-... )
-
Convert a Position Frequency Matrix (PFM) file to a Position Weight Matrix (PWM).
-
-
Parameters:
-
-
pfm_filename (str) – The file path to the PFM file.
-
pseudocount (float, optional) – The pseudocount to add to the PFM to avoid zero probabilities. Default is 0.8.
-
background_freqs (dict, optional) – Dictionary containing the background frequencies for each amino acid.
-If None, uses 1/20 for all.
-
-
-
Returns:
-
DataFrame representation of the PWM.
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The conversion process involves:
-1. Adding pseudocounts to the PFM
-2. Converting to Position Probability Matrix (PPM)
-3. Converting to PWM using log2(PPM/background_freqs)
Remove modifications from a peptide sequence, with an option to keep specific modifications.
-
-
Parameters:
-
-
peptide (str) – The peptide sequence with modifications in brackets.
-Example: ‘AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK’
-
keep_modification (str or list, optional) – The modification(s) to keep. If provided, only these modifications will be
-preserved in the output sequence. Default is None.
-
-
-
Returns:
-
The peptide sequence with modifications removed or kept.
-Example: ‘AANDAGYFNDEMAPIEVKTK’ (if keep_modification is None)
-Example: ‘AANDAGYFNDEM[15.9949]APIEVKTK’ (if keep_modification=[‘15.9949’])
-
-
Return type:
-
str
-
-
-
-
Notes
-
Modifications are specified in square brackets after the amino acid.
-If keep_modification is provided, only those specific modifications will be
-preserved in the output sequence.
Preprocess the peptide sequence by removing flanking regions and modifications.
-
-
Parameters:
-
peptide (str) – Original peptide sequence with possible flanking regions and modifications.
-Example: ‘.AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK.’
-
-
Returns:
-
Cleaned peptide sequence without flanking regions and modifications.
-Example: ‘AANDAGYFNDEMAPIEVKTK’
-
-
Return type:
-
str
-
-
-
-
Notes
-
This function performs two operations in sequence:
-1. Removes flanking amino acids using remove_pre_and_nxt_aa
-2. Removes all modifications using remove_modifications
Retrieve all files in the specified directory and return a list of file paths.
-
-
Parameters:
-
directory_path (str) – The path to the directory to search in.
-Example: ‘/path/to/directory’
-
-
Returns:
-
List of absolute file paths found in the directory and its subdirectories.
-Example: [‘/path/to/directory/file1.txt’, ‘/path/to/directory/subdir/file2.txt’]
-
-
Return type:
-
list of str
-
-
-
-
Notes
-
This function recursively searches through all subdirectories and returns
-absolute paths for all files found.
Convert a modified peptide sequence into Unimod format.
-
-
Parameters:
-
-
peptide (str) – The input peptide sequence with modifications in brackets.
-Example: ‘AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK’
-
mod_dict (dict) – Dictionary mapping modification names (in peptide) to corresponding Unimod names.
-Example: {‘15.9949’: ‘UNIMOD:4’, ‘42.0106’: ‘UNIMOD:1’}
-
-
-
Returns:
-
The peptide sequence formatted for Unimod.
-Example: ‘AANDAGYFNDEM[UNIMOD:4]APIEVK[UNIMOD:1]TK’
-
-
Return type:
-
str
-
-
-
-
Notes
-
This function replaces the modification names in brackets with their
-corresponding Unimod identifiers while preserving the peptide sequence
-structure.
This function:
-1. Reads PIN file(s) into a DataFrame
-2. Identifies required columns (case-insensitive)
-3. Processes scan IDs and hit ranks
-4. Converts data types appropriately
-5. Creates a PsmContainer with the processed data
ValueError – If the PepXML files were generated by Percolator or PeptideProphet.
-
-
-
-
Notes
-
This function:
-1. Reads and parses PepXML files
-2. Calculates mass difference features
-3. Processes matched ions and complementary ions
-4. Creates charge columns
-5. Log-transforms p-values
-6. Returns a PsmContainer with the processed data
scan_ids (list[int] or None, optional) – A list of scan IDs to extract. If None, extracts all scans.
-
-
-
Returns:
-
A DataFrame containing the extracted scan data with columns:
-- source: The source file name
-- scan: The scan ID
-- mz: The m/z values array
-- intensity: The intensity values array
-- charge: The charge state
-- retention_time: The retention time
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This function:
-1. Reads the mzML file using pyteomics
-2. Extracts scan data including retention time, charge state, m/z values, and intensities
-3. Filters scans based on provided scan IDs if specified
-4. Returns a DataFrame with the extracted data
This function:
-1. Reads PIN file(s) into a DataFrame
-2. Identifies required columns (case-insensitive)
-3. Processes scan IDs and hit ranks
-4. Converts data types appropriately
-5. Creates a PsmContainer with the processed data
ValueError – If the PepXML files were generated by Percolator or PeptideProphet.
-
-
-
-
Notes
-
This function:
-1. Reads and parses PepXML files
-2. Calculates mass difference features
-3. Processes matched ions and complementary ions
-4. Creates charge columns
-5. Log-transforms p-values
-6. Returns a PsmContainer with the processed data
msms_run (tuple of anything, lxml.etree.Element) – The second element of the tuple should be the XML element for a single
-msms_run. The first is not used, but is necessary for compatibility
-with using map().
-
decoy_prefix (str) – The prefix used to indicate a decoy protein in the description lines
-of the FASTA file.
-
-
-
Yields:
-
dict – A dictionary describing all of the PSMs in a run.
Log-transform columns that are p-values or E-values.
-
-
Parameters:
-
-
col (pandas.Series) – A column of the dataset.
-
features (list of str) – The features of the dataset. Only feature columns will be considered
-for transformation.
-
-
-
Returns:
-
The log-transformed values of the column if the feature was determined
-to be a p-value.
-
-
Return type:
-
pandas.Series
-
-
-
-
Notes
-
This function:
-1. Detects columns written in scientific notation and log them
-2. Uses a simple heuristic to find p-value / E-value features
-3. Only transforms if values span >4 orders of magnitude
-4. Preserves precision for scientific notation values
scan_ids (list[int] or None, optional) – A list of scan IDs to extract. If None, extracts all scans.
-
-
-
Returns:
-
A DataFrame containing the extracted scan data with columns:
-- source: The source file name
-- scan: The scan ID
-- mz: The m/z values array
-- intensity: The intensity values array
-- charge: The charge state
-- retention_time: The retention time
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This function:
-1. Reads the mzML file using pyteomics
-2. Extracts scan data including retention time, charge state, m/z values, and intensities
-3. Filters scans based on provided scan IDs if specified
-4. Returns a DataFrame with the extracted data
Feature generator that generates basic features from peptide sequences.
-
This generator calculates features such as peptide length, proportion of unique amino acids,
-Shannon entropy of amino acid distribution, difference between peptide length and average peptide length,
-and count of unique amino acids.
-
-
Parameters:
-
-
peptides (List[str]) – List of peptide sequences to generate features for.
-
remove_pre_nxt_aa (bool, optional) – Whether to remove the amino acids adjacent to the peptide.
-If True, removes them. Default is True.
-
remove_modification (bool, optional) – Whether to remove modifications in the peptide sequences.
-If True, removes them. Default is True.
-
-
-
-
-
Notes
-
The generated features include:
-- length_diff_from_avg: Difference between peptide length and average length
-- abs_length_diff_from_avg: Absolute difference between peptide length and average length
-- unique_aa_count: Number of unique amino acids in the peptide
-- unique_aa_proportion: Proportion of unique amino acids in the peptide
-- shannon_entropy: Shannon entropy of amino acid distribution
Generate basic features for the provided peptides.
-
-
Returns:
-
DataFrame containing peptides and their computed features:
-- length_diff_from_avg: Difference from average peptide length
-- abs_length_diff_from_avg: Absolute difference from average length
-- unique_aa_count: Number of unique amino acids
-- unique_aa_proportion: Proportion of unique amino acids
-- shannon_entropy: Shannon entropy of amino acid distribution
-
-
Return type:
-
pd.DataFrame
-
-
Raises:
-
ValueError – If NaN values are found in the generated features.
-
-
-
-
Notes
-
All features are converted to float type before returning.
-The method calculates average peptide length across all peptides
-and uses it as a reference for length-based features.
This generator uses DeepLC to predict retention times and calculates various
-features based on the differences between predicted and observed retention times.
-
-
Parameters:
-
-
psms (PsmContainer) – PSMs to generate features for.
-
calibration_criteria_column (str) – Column name in the PSMs DataFrame to use for DeepLC calibration.
-
lower_score_is_better (bool, optional) – Whether a lower PSM score denotes a better matching PSM. Default is False.
-
calibration_set_size (int or float, optional) – Amount of best PSMs to use for DeepLC calibration. If this value is lower
-than the number of available PSMs, all PSMs will be used. Default is 0.15.
-
processes (int, optional) – Number of processes to use in DeepLC. Default is 1.
-
model_path (str, optional) – Path to the DeepLC model. If None, the default model will be used.
-
remove_pre_nxt_aa (bool, optional) – Whether to remove the first and last amino acids from the peptide sequence.
-Default is True.
-
mod_dict (dict, optional) – Dictionary of modifications to be used for DeepLC. If None, no modifications
-will be used.
-
-
-
-
-
Notes
-
DeepLC retraining is on by default. Add deeplc_retrain:False as a keyword
-argument to disable retraining.
-
The generated features include:
-- observed_retention_time: Original retention time from the data
-- predicted_retention_time: DeepLC predicted retention time
-- retention_time_diff: Difference between predicted and observed times
-- abs_retention_time_diff: Absolute difference between predicted and observed times
-- retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)
Return the list of input columns required for the feature generator.
-
-
Returns:
-
List of input columns required for feature generation.
-Currently returns an empty list as the required columns are
-handled internally by the PsmContainer.
Extract the format required by DeepLC, while retaining necessary original information.
-
-
Returns:
-
DataFrame with the required DeepLC format and original information:
-- original_seq: Original peptide sequence
-- label: Target/decoy label
-- seq: Cleaned peptide sequence
-- modifications: Unimod format modifications
-- tr: Retention time
-- score: Calibration criteria score
-
-
Return type:
-
pd.DataFrame
-
-
Raises:
-
ValueError – If retention time column is not found in the PSMs DataFrame.
-
-
-
-
Notes
-
This method prepares the data in the format required by DeepLC,
-including cleaning peptide sequences and converting modifications
-to Unimod format.
DataFrame containing the PSMs with added DeepLC features:
-- original_seq: Original peptide sequence
-- observed_retention_time: Original retention time
-- predicted_retention_time: DeepLC predicted retention time
-- retention_time_diff: Difference between predicted and observed times
-- abs_retention_time_diff: Absolute difference between predicted and observed times
-- retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Prepares data in DeepLC format
-2. Calibrates DeepLC if calibration set is specified
-3. Predicts retention times
-4. Calculates various retention time-based features
-5. Handles missing values by imputing with median values
DataFrame containing the DeepLC input data with all columns:
-- original_seq: Original peptide sequence
-- label: Target/decoy label
-- seq: Cleaned peptide sequence
-- modifications: Unimod format modifications
-- tr: Retention time
-- score: Calibration criteria score
-- predicted_retention_time: DeepLC predicted retention time
-- retention_time_diff: Difference between predicted and observed times
DataFrame containing the raw predictions:
-- peptide: Cleaned peptide sequence
-- predicted_rt: DeepLC predicted retention time
-- observed_rt: Original retention time
-- modifications: Unimod format modifications
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
If predictions haven’t been generated yet, this will trigger
-feature generation automatically.
DataFrame containing the raw predictions:
-- peptide: Cleaned peptide sequence
-- predicted_rt: DeepLC predicted retention time
-- observed_rt: Original retention time
-- modifications: Unimod format modifications
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This is a convenience method that returns the same data as the
-raw_predictions property.
**kwargs (dict) – Additional parameters passed to pandas.DataFrame.to_csv.
-If ‘index’ is not specified, it defaults to False.
-
-
-
Return type:
-
None
-
-
-
-
Notes
-
This method saves the raw predictions DataFrame to a CSV file.
-The DataFrame includes:
-- peptide: Cleaned peptide sequence
-- predicted_rt: DeepLC predicted retention time
-- observed_rt: Original retention time
-- modifications: Unimod format modifications
Generate MHCflurry features for the provided peptides and alleles.
-
-
Returns:
-
DataFrame containing the peptides and their predicted MHCflurry features:
-- Peptide: Original peptide sequence
-- mhcflurry_affinity: Binding affinity
-- mhcflurry_processing_score: Processing score
-- mhcflurry_presentation_score: Presentation score
-- mhcflurry_presentation_percentile: Presentation percentile
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Runs MHCflurry predictions
-2. Renames columns to include ‘mhcflurry_’ prefix
-3. Fills missing values with median values
-4. Returns the final feature DataFrame
Generate NetMHCpan features for peptides based on specified MHC class I alleles.
-
This generator calculates NetMHCpan binding predictions for each peptide against
-the provided MHC class I alleles.
-
-
Parameters:
-
-
peptides (List[str]) – List of peptide sequences.
-
alleles (List[str]) – List of MHC allele names (e.g., [‘HLA-A*02:01’, ‘HLA-B*07:02’]).
-
mode (str, optional) – Mode of feature generation. Options:
-- ‘best’: Return only the best allele information for each peptide.
-- ‘all’: Return predictions for all alleles with allele-specific suffixes plus best allele info.
-Default is ‘best’.
-
remove_pre_nxt_aa (bool, optional) – Whether to include the previous and next amino acids in peptides.
-If True, remove them. Default is True.
-
remove_modification (bool, optional) – Whether to include modifications in peptides.
-If True, remove them. Default is True.
-
n_processes (int, optional) – Number of processes to use for multiprocessing.
-Default is 1 (no multiprocessing).
-
show_progress (bool, optional) – Whether to display a progress bar. Default is False.
-
-
-
-
-
Notes
-
The generated features include:
-- netmhcpan_score: Raw binding score
-- netmhcpan_affinity: Binding affinity in nM
-- netmhcpan_percentile_rank: Percentile rank of the binding score
Return the list of generated feature column names, determined by the mode.
-Only includes numerical features, excluding any string features like allele names.
-
-
Returns:
-
List of feature column names:
-- For ‘all’ mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele},
-
-
netmhcpan_percentile_rank_{allele} for each allele
-
-
-
For both modes: netmhcpan_best_score, netmhcpan_best_affinity,
-netmhcpan_best_percentile_rank
DataFrame containing the prediction results:
-- Peptide: Original peptide sequence
-- peptide: Cleaned peptide sequence
-- allele: MHC allele
-- score: Raw binding score
-- affinity: Binding affinity in nM
-- percentile_rank: Percentile rank
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Preprocesses peptides
-2. Filters peptides by length (8-30 amino acids)
-3. Runs predictions (with or without multiprocessing)
-4. Merges results with original peptides
Generate the final feature table with NetMHCpan features for each peptide.
-
-
Returns:
-
DataFrame containing peptides and their predicted features:
-- Peptide: Original peptide sequence
-- For ‘all’ mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele},
-
-
netmhcpan_percentile_rank_{allele} for each allele
-
-
-
For both modes: netmhcpan_best_score, netmhcpan_best_affinity,
-netmhcpan_best_percentile_rank
-
-
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The features generated depend on the mode:
-- ‘best’: Only the best allele information for each peptide
-- ‘all’: All allele predictions plus best allele information
-
Missing values are handled consistently by filling with median values
-for numeric columns.
predictions_df (pd.DataFrame) – The predictions DataFrame.
-
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
-
Returns:
-
Updated features DataFrame with all allele features:
-- Peptide: Original peptide sequence
-- netmhcpan_score_{allele}: Raw binding score for each allele
-- netmhcpan_affinity_{allele}: Binding affinity for each allele
-- netmhcpan_percentile_rank_{allele}: Percentile rank for each allele
predictions_df (pd.DataFrame) – The predictions DataFrame.
-
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
-
Returns:
-
Updated features DataFrame with best allele features:
-- Peptide: Original peptide sequence
-- netmhcpan_best_allele: Best binding allele
-- netmhcpan_best_score: Best binding score
-- netmhcpan_best_affinity: Best binding affinity
-- netmhcpan_best_percentile_rank: Best percentile rank
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The best allele is determined by the lowest percentile rank.
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
Returns:
-
Updated features DataFrame with filled missing values.
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Fills best allele string values with ‘Unknown’
-2. Fills numeric values with median for all allele features
-3. Fills numeric values for best allele features with median
Use NetMHCIIpan43_BA to predict a batch of peptides (MHC Class II).
-
-
Return type:
-
DataFrame
-
-
Parameters:
-
-
peptides_chunk (List[str])
-
alleles (List[str])
-
-
-
-
-
Parameters:
peptides_chunk (List[str]): A batch of peptide sequences to predict.
-alleles (List[str]): List of MHC Class II alleles, e.g., [‘DRB1_0101’, ‘DRB1_0102’].
-
-
Returns:
pd.DataFrame: A DataFrame containing prediction results.
Generate NetMHCIIpan features for given peptides based on specified MHC Class II alleles.
-
This feature generator uses the NetMHCIIpan43_BA interface to predict MHC Class II binding
-for each peptide and returns scores and features based on the specified parameters.
-
-
Parameters:
-
-
peptides (List[str]) – List of peptide sequences.
-
alleles (List[str]) – List of MHC Class II alleles, e.g., [‘DRB1_0101’, ‘DRB1_0102’].
-
mode (str, optional) – Feature generation mode. Options:
-- ‘best’: Return only the best result for each peptide across all alleles.
-- ‘all’: Return prediction results for each peptide across all alleles (with allele-specific column suffixes).
-Default is ‘best’.
-
remove_pre_nxt_aa (bool, optional) – Whether to remove the amino acids flanking the peptide (e.g., removing X-AA/AA-X forms).
-Default is True.
-
remove_modification (bool, optional) – Whether to remove modification information from peptides, e.g., (Phospho).
-Default is True.
-
n_processes (int, optional) – Number of processes to use. Default is 1 (no multiprocessing).
-
show_progress (bool, optional) – Whether to display a progress bar. Default is False.
-
-
-
-
-
Notes
-
The generated features include:
-- netmhciipan_score: Raw binding score
-- netmhciipan_affinity: Binding affinity in nM
-- netmhciipan_percentile_rank: Percentile rank of the binding score
Return the list of generated feature column names, determined by the mode.
-Only includes numerical features, excluding any string features like allele names.
-
-
Returns:
-
List of feature column names:
-- For ‘all’ mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele},
-
-
netmhciipan_percentile_rank_{allele} for each allele
-
-
-
For both modes: netmhciipan_best_score, netmhciipan_best_affinity,
-netmhciipan_best_percentile_rank
DataFrame containing the prediction results:
-- Peptide: Original peptide sequence
-- peptide: Cleaned peptide sequence
-- allele: MHC allele
-- score: Raw binding score
-- affinity: Binding affinity in nM
-- percentile_rank: Percentile rank
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Preprocesses peptides
-2. Filters peptides by length (9-30 amino acids)
-3. Runs predictions (with or without multiprocessing)
-4. Merges results with original peptides
Generate the final feature table with NetMHCIIpan features for each peptide.
-
-
Returns:
-
DataFrame containing peptides and their predicted features:
-- Peptide: Original peptide sequence
-- For ‘all’ mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele},
-
-
netmhciipan_percentile_rank_{allele} for each allele
-
-
-
For both modes: netmhciipan_best_score, netmhciipan_best_affinity,
-netmhciipan_best_percentile_rank
-
-
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The features generated depend on the mode:
-- ‘best’: Only the best allele information for each peptide
-- ‘all’: All allele predictions plus best allele information
-
Missing values are handled consistently by filling with median values
-for numeric columns.
predictions_df (pd.DataFrame) – The predictions DataFrame.
-
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
-
Returns:
-
Updated features DataFrame with all allele features:
-- Peptide: Original peptide sequence
-- netmhciipan_score_{allele}: Raw binding score for each allele
-- netmhciipan_affinity_{allele}: Binding affinity for each allele
-- netmhciipan_percentile_rank_{allele}: Percentile rank for each allele
predictions_df (pd.DataFrame) – The predictions DataFrame.
-
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
-
Returns:
-
Updated features DataFrame with best allele features:
-- Peptide: Original peptide sequence
-- netmhciipan_best_allele: Best binding allele
-- netmhciipan_best_score: Best binding score
-- netmhciipan_best_affinity: Best binding affinity
-- netmhciipan_best_percentile_rank: Best percentile rank
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The best allele is determined by the lowest percentile rank.
features_df (pd.DataFrame) – The features DataFrame to update.
-
-
Returns:
-
Updated features DataFrame with filled missing values.
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
This method:
-1. Fills best allele string values with ‘Unknown’
-2. Fills numeric values with median for all allele features
-3. Fills numeric values for best allele features with median
Generates features based on peptide sequence overlaps using the Overlap-Layout-Consensus (OLC) algorithm.
-
This generator constructs an overlap graph of peptides, removes transitive edges, simplifies the graph to contigs,
-and computes features such as the number of overlaps, log-transformed overlap counts, overlap ranks, and contig lengths.
-It also filters out peptides with low entropy or outlier lengths before processing.
-Additionally, it records detailed information about brother peptides and contigs, accessible via the get_all_data method.
-
-
Parameters:
-
-
peptides (list of str) – List of peptide sequences.
-
min_overlap_length (int, optional) – Minimum required overlap length for peptides to be considered overlapping. Default is 6.
-
min_length (int, optional) – Minimum peptide length to include in processing. Default is 7.
-
max_length (int, optional) – Maximum peptide length to include in processing. Default is 60.
-
min_entropy (float, optional) – Minimum Shannon entropy for peptides to include in processing. Default is 0.
-
fill_missing (str, optional) – Method to fill missing values for filtered peptides. Options are ‘median’ or ‘zero’. Default is ‘median’.
-
remove_pre_nxt_aa (bool, optional) – Whether to remove the preceding and following amino acids from peptides. Default is False.
-
remove_modification (bool, optional) – Whether to remove modifications from peptides. Default is True.
-
-
-
Variables:
-
-
original_peptides (list of str) – Original list of peptide sequences.
peptide_to_contig (dict of str to int) – Mapping of peptides to their contig indices.
-
assembled_contigs (list of dict) – List of assembled contigs.
-
full_data (pd.DataFrame) – Full data including brother peptides and contig information.
-
_overlap_graph (nx.DiGraph) – Overlap graph.
-
_simplified_graph (nx.DiGraph) – Simplified graph with transitive edges removed.
-
-
-
-
-
Notes
-
-
Key Data Structures:
-
contigs: List[List[str]]
-- Represents non-branching paths in the overlap graph
-- Each inner list contains peptide sequences that form a continuous chain
-- Example: [[‘PEPTIDE1’, ‘PEPTIDE2’], [‘PEPTIDE3’]]
-
assembled_contigs: List[Dict]
-- Contains the assembled sequences and their constituent peptides
-- Each dictionary has two keys:
-
-
‘sequence’: The merged/assembled sequence of overlapping peptides
-‘peptides’: List of peptides that were used to build this contig
peptide_to_contig: Dict[str, int]
-- Maps each peptide to its contig index in assembled_contigs
-- Key: peptide sequence
-- Value: index of the contig containing this peptide
-- Example: {
overlap_graph (_overlap_graph): nx.DiGraph
-- Directed graph representing all possible overlaps between peptides
-- Nodes: peptide sequences
-- Edges: overlaps between peptides
-- Edge weights: length of overlap
-
simplified_graph (_simplified_graph): nx.DiGraph
-- Simplified version of overlap_graph with transitive edges removed
-- Used for final contig assembly
-- More efficient representation of essential overlaps
Remove peptides that are fully contained in other peptides.
-
-
Return type:
-
Tuple[List[str], Dict[str, str]]
-
-
Parameters:
-
peptides (List[str])
-
-
-
-
Parameters:
peptides (List[str]): List of peptide sequences.
-
-
Returns:
accepted_peptides (List[str]): List of accepted peptides not redundant (largest container peptides).
-redundant_mapping (Dict[str, str]): Mapping of redundant peptide to its largest container peptide.
Calculate overlap and contig related features for peptides.
-
This method computes the overlap graph, simplifies it to contigs, assembles contigs,
-and generates features for each peptide. It also handles redundant peptides by mapping them
-to their container peptides.
-
-
Return type:
-
DataFrame
-
-
Parameters:
-
peptides (List[str])
-
-
-
-
Parameters:
peptides (List[str]): List of peptide sequences.
-
-
Returns:
pd.DataFrame: DataFrame containing the computed features.
Compute the features for each peptide, ensuring output aligns with input.
-
This method preprocesses and filters peptides, calculates overlap and contig features,
-maps them back to the original peptides, and fills missing values.
-
-
Return type:
-
DataFrame
-
-
-
-
Returns:
pd.DataFrame: DataFrame containing features for each peptide.
Returns the full data including brother peptides and contig information for each peptide.
-In the output, the lists of contig peptides and brother peptides include redundant peptides,
-so that their counts match the corresponding peptide and contig_member_count.
-
-
Return type:
-
DataFrame
-
-
-
-
Returns:
pd.DataFrame: DataFrame containing peptides and their brother peptides and contigs.
Assign aggregated features based on brother peptides to the PSMs.
-
For PSMs with the same ContigSequence (brother peptides), compute the mean of specified features
-and assign these aggregated features back to each PSM in the group. Additionally, compute
-the sum as mean * (contig_member_count + 1). If a PSM does not have a ContigSequence (no brothers),
-its new features will be set to the original values.
psms (PsmContainer): PSM container containing the peptides and features.
-feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate.
-overlapping_source (str): Source name of the overlapping peptide features.
-source_name (str): Name of the new feature source.
peptides (List[str]): Series of peptide sequences.
-alleles (List[str]): List of MHC allele names (e.g., [‘HLA-A01:01’, ‘HLA-B07:02’]).
-mhc_class (str): MHC class, either ‘I’ or ‘II’. Default is ‘I’.
-pwm_path (Optional[Union[str, os.PathLike]]): Custom path to PWM files. Defaults to ‘../../data/PWMs’.
-remove_pre_nxt_aa (bool): Whether to include the previous and next amino acids in peptides.
-
-
If True, remove them. Default is False.
-
-
-
remove_modification (bool): Whether to include modifications in peptides.
Calculate PWM scores for MHC class II using a sliding 9-mer window.
-
-
Parameters:
-
-
peptide (str) – The peptide sequence to score.
-
allele (str) – The MHC allele to score against.
-
-
-
Returns:
-
A tuple containing:
-- core_score : float or None
-
-
Score for the best 9-mer core.
-
-
-
-
n_flank_scorefloat or None
Score for the N-terminal flanking region.
-
-
-
-
-
c_flank_scorefloat or None
Score for the C-terminal flanking region.
-
-
-
-
-
Returns (None, None, None) if peptide has length < 9.
-
-
-
Return type:
-
tuple of (float or None, float or None, float or None)
-
-
-
-
Notes
-
The method:
-1. Slides over all possible 9-mer windows to find the highest core PWM score.
-2. Once the best core is found, extracts up to 3 AA on each flank (N-flank and C-flank).
-3. If the flank has fewer than 3 residues, pads with ‘X’.
-4. Scores each flank with n_flank_pwm and c_flank_pwm.
Feature generator for calculating similarity between experimental and predicted spectra.
-
This class works through the following steps:
-1. Extract experimental spectral data from mzML files
-2. Use Koina for theoretical spectra prediction
-3. Align experimental and predicted spectra
-4. Calculate similarity metrics as features
-
-
Parameters:
peptides (List[str]): List of peptide sequences
-charges (List[int]): List of charge states
-scan_ids (List[int]): List of scan IDs
-mz_file_paths (List[str]): List of mzML file paths
-model_type (str): Prediction model type, either “HCD” or “CID”
-collision_energies (List[float]): List of collision energies, required when model_type is “HCD”
-remove_pre_nxt_aa (bool): Whether to remove preceding and next amino acids, default is True
-remove_modification (bool): Whether to remove modifications, default is True
-url (str): Koina server URL, default is “koina.wilhelmlab.org:443”
-top_n (int): Number of top peaks to use for alignment, default is 12
-tolerance_ppm (float): Mass tolerance for alignment in ppm, default is 20
As Prosit does not support non-standard amino acid ‘U’, we replace it with ‘C’.
-
-
Parameters:
-
peptide (str) – Original peptide sequence.
-
-
Returns:
-
Processed peptide sequence.
-
-
Return type:
-
str
-
-
-
-
Notes
-
This is nonsense when it comes to spectral prediction, but we need to keep it
-for compatibility with Koina. In the future, this should be prohibited at the input level.
Extract experimental spectral data from mzML files.
-
-
Returns:
-
DataFrame containing experimental spectral data.
-
-
Return type:
-
pd.DataFrame
-
-
-
-
Notes
-
The method groups scan IDs by file path for efficiency and extracts
-spectral data from each mzML file. The resulting DataFrame contains
-m/z values, intensities, and associated metadata for each spectrum.
processed_peptides (list of str) – List of preprocessed peptide sequences.
-
charges (list of int) – List of charge states.
-
-
-
Returns:
-
DataFrame containing predicted spectral data.
-
-
Return type:
-
pd.DataFrame
-
-
Raises:
-
Exception – If there is an error during Koina prediction.
-
-
-
-
Notes
-
The method uses Koina to predict theoretical spectra for each peptide.
-For AlphaPeptDeep_ms2_generic model, inputs are split into batches
-grouped by peptide length for prediction.
Extract top N peaks based on predicted intensity for similarity calculation
-
-
Return type:
-
Tuple[ndarray, ndarray, List[Tuple]]
-
-
Parameters:
-
-
aligned_exp_intensity (ndarray)
-
aligned_pred_intensity (ndarray)
-
matched_indices (List[Tuple])
-
top_n (int)
-
-
-
-
-
Parameters:
aligned_exp_intensity (np.ndarray): Aligned experimental intensity vector
-aligned_pred_intensity (np.ndarray): Aligned predicted intensity vector
-matched_indices (List[Tuple]): Matching index pairs (pred_idx, exp_idx)
-top_n (int): Number of top peaks to extract
This is a wrapper around _align_spectra_all_peaks and _get_top_peaks_vectors
-to maintain backward compatibility while using the improved algorithm.
-
-
Parameters:
-
-
exp_mz (list of float) – Experimental m/z values.
-
exp_intensity (list of float) – Experimental intensity values.
-
pred_mz (list of float) – Predicted m/z values.
-
pred_intensity (list of float) – Predicted intensity values.
-
pred_annotation (list of str, optional) – Predicted fragment annotations.
-
-
-
Returns:
-
-
Aligned experimental intensity vector
-
Predicted intensity vector
-
Matching index pairs (for top N peaks)
-
-
-
-
Return type:
-
tuple of (np.ndarray, np.ndarray, list of tuple)
-
-
-
-
Notes
-
The method first aligns all peaks using _align_spectra_all_peaks, then
-extracts the top N peaks using _get_top_peaks_vectors for compatibility
-with existing code.
Based on the method described in https://www.nature.com/articles/s41592-021-01331-z.
-The spectral entropy is calculated using the formula:
-1 - (2*S_PM - S_P - S_M)/ln(4), where S_PM is the entropy of the mixed
-distribution, and S_P and S_M are the entropies of the individual distributions.
Dictionary of similarity features, including:
-- spectral_angle_similarity: Spectral angle similarity (0-1)
-- cosine_similarity: Cosine similarity (0-1)
-- pearson_correlation: Pearson correlation (-1 to 1)
-- spearman_correlation: Spearman correlation (-1 to 1)
-- mean_squared_error: Mean squared error
-- unweighted_entropy_similarity: Spectral entropy similarity
-- predicted_seen_nonzero: Number of predicted peaks seen in experimental spectrum
-- predicted_not_seen: Number of predicted peaks not seen in experimental spectrum
-- bray_curtis_similarity: Bray-Curtis similarity (0-1)
Unified function to plot average feature importance across multiple models.
-
-
This function supports:
-
Linear models (e.g., Linear SVR) which provide an ‘estimator’ attribute with a ‘coef_’.
-The absolute value of the coefficients is used for importance, and hatch patterns are applied
-to differentiate between positive and negative coefficients.
-
XGBoost models which provide a ‘feature_importances_’ attribute. Since these values are
-always positive, no hatch patterns are applied.
-
-
-
-
-
Parameters:
-
-
models (list) – A list of model objects.
-For linear models, each model should have an ‘estimator’ with ‘coef_’.
-For XGBoost models, each model should have a ‘feature_importances_’ attribute.
-
rescoring_features (dict) – A dictionary where keys are sources and values are lists of features.
-
save_path (str, optional) – If provided, saves the plot to the specified path.
-
sort (bool, optional) – If True, sorts the features by their importance in descending order.
-Default is False.
-
error (bool, optional) – If True, adds error bars to the plot. Default is False.
-
**kwargs (dict) – Additional plotting parameters such as ‘figsize’ and ‘dpi’.
-
-
-
-
-
Notes
-
The function automatically detects the model type based on the presence of the corresponding attribute.
-For linear models, it uses hatch patterns to differentiate between positive and negative coefficients.
-For XGBoost models, it uses solid bars since the importances are always positive.
Visualize the distribution of features in a DataFrame using kernel density estimation plots.
-
-
Parameters:
-
-
psms (PsmContainer) – A PsmContainer object containing the features to visualize.
-
num_cols (int, optional) – The number of columns in the plot grid. Default is 5.
-
save_path (str, optional) – The file path to save the plot. If not provided, the plot is displayed.
-
**kwargs (dict) – Additional plotting parameters such as figsize and dpi, etc.
-
-
-
-
-
Notes
-
This function:
-1. Extracts rescoring features from the PsmContainer
-2. Filters out features with only one unique value
-3. Creates a grid of plots showing the distribution of each feature
-4. Separates target and decoy PSMs in each plot
-5. Uses kernel density estimation to show the distribution shape
Visualize the correlation between features in a DataFrame using a heatmap.
-
-
Parameters:
-
-
psms (PsmContainer) – A PsmContainer object containing the features to visualize.
-
save_path (str, optional) – The file path to save the plot. If not provided, the plot is displayed.
-
**kwargs (dict) – Additional plotting parameters such as figsize and dpi, etc.
-
-
-
-
-
Notes
-
This function:
-1. Extracts all rescoring features from the PsmContainer
-2. Calculates the correlation matrix between features
-3. Creates a heatmap visualization of the correlations
-4. Uses a coolwarm colormap to show positive and negative correlations
results (object or list) – A list of results objects or a single result object.
-Each result object should have a method plot_qvalues.
-
save_path (str, optional) – If provided, saves the plot to the specified path.
-
dpi (int, optional) – The resolution of the plot. Default is 300.
-
figsize (tuple, optional) – The size of the figure. Default is (15, 10).
-
threshold (float, optional) – The q-value threshold for plotting. Default is 0.05.
-
colors (list, optional) – A list of colors for the plots. If not provided, uses default colors.
-
**kwargs (dict) – Additional plotting parameters.
-
-
-
Returns:
-
The function displays or saves the plot.
-
-
Return type:
-
None
-
-
-
-
Notes
-
This function:
-1. Creates a figure with two subplots for PSMs and peptides
-2. Plots q-values for each result with different colors
-3. Adds legends and titles to each subplot
-4. Saves or displays the plot based on save_path
-
-
-
-
\ No newline at end of file
diff --git a/docs/source/api.rst b/docs/source/api.rst
deleted file mode 100644
index d2c8a7b..0000000
--- a/docs/source/api.rst
+++ /dev/null
@@ -1,141 +0,0 @@
-API Reference
-============
-
-Core Modules
------------
-
-.. automodule:: optimhc.core
- :members:
- :undoc-members:
- :show-inheritance:
-
-Logging
--------
-
-.. automodule:: optimhc.core.logging_helper
- :members:
- :undoc-members:
- :show-inheritance:
-
-CLI Interface
-------------
-
-.. automodule:: optimhc.cli
- :members:
- :undoc-members:
- :show-inheritance:
-
-
-PSM Container
-------------
-
-.. automodule:: optimhc.psm_container
- :members:
- :undoc-members:
- :show-inheritance:
-
-Utilities
---------
-
-.. automodule:: optimhc.utils
- :members:
- :undoc-members:
- :show-inheritance:
-
-Parser Modules
--------------
-
-.. automodule:: optimhc.parser
- :members:
- :undoc-members:
- :show-inheritance:
-
-Parser Submodules
-~~~~~~~~~~~~~~~
-
-.. automodule:: optimhc.parser.pin
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.parser.pepxml
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.parser.mzml
- :members:
- :undoc-members:
- :show-inheritance:
-
-Feature Generator
----------------
-
-.. automodule:: optimhc.feature_generator
- :members:
- :undoc-members:
- :show-inheritance:
-
-Feature Generator Submodules
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. automodule:: optimhc.feature_generator.base_feature_generator
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.basic
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.DeepLC
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.mhcflurry
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.netMHCpan
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.netMHCIIpan
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.overlapping_peptide
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.PWM
- :members:
- :undoc-members:
- :show-inheritance:
-
-.. automodule:: optimhc.feature_generator.spectral_similarity
- :members:
- :undoc-members:
- :show-inheritance:
-
-Rescore Modules
---------------
-
-.. automodule:: optimhc.rescore
- :members:
- :undoc-members:
- :show-inheritance:
-
-Visualization
-------------
-
-.. automodule:: optimhc.visualization
- :members:
- :undoc-members:
- :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
deleted file mode 100644
index 6d8e1c7..0000000
--- a/docs/source/conf.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-project = 'optiMHC'
-copyright = '2024, Zixiang Shang'
-author = 'Zixiang Shang'
-release = '0.1.0'
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-import os
-import sys
-sys.path.insert(0, os.path.abspath('../..'))
-
-extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.viewcode',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.mathjax',
- 'sphinx_autodoc_typehints',
- 'nbsphinx'
-]
-
-html_show_sourcelink = False
-
-# Napoleon settings
-napoleon_google_docstring = False
-napoleon_numpy_docstring = True
-napoleon_include_init_with_doc = True
-napoleon_include_private_with_doc = True
-napoleon_include_special_with_doc = True
-napoleon_use_admonition_for_examples = True
-napoleon_use_admonition_for_notes = True
-napoleon_use_admonition_for_references = True
-napoleon_use_ivar = True
-napoleon_use_param = True
-napoleon_use_rtype = True
-napoleon_type_aliases = None
-
-templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = 'sphinx_rtd_theme'
-html_static_path = ['_static']
-
-# -- Options for autodoc ----------------------------------------------------
-autodoc_member_order = 'bysource'
-autodoc_typehints = 'description'
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
deleted file mode 100644
index 4b7c683..0000000
--- a/docs/source/examples.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Examples
-========
-
-Example notebooks and code will be added here.
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index 85e26e5..0000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-.. optiMHC documentation master file, created by
- sphinx-quickstart on Sun May 11 02:30:52 2025.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to optiMHC's documentation!
-================================
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- installation
- usage
- api
- examples
-
-Introduction
-------------
-
-optiMHC is a optimum rescoring pipeline for immunopeptidomics data to significantly enhance peptide identification performance.
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
deleted file mode 100644
index f2a5b38..0000000
--- a/docs/source/installation.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Installation
-============
-
-Requirements
------------
-
-* Python 3.11 or higher
-* pip or conda
-
-
-Development Installation
-----------------------
-
-To install the development version:
-
-.. code-block:: bash
-
- git clone https://github.com/5h4ng/optimhc.git
- cd optimhc
- pip install -e .
\ No newline at end of file
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
deleted file mode 100644
index d0c08a9..0000000
--- a/docs/source/usage.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Usage
-=====
-
-Usage instructions will be added here.
\ No newline at end of file
diff --git a/docs/tutorial/examples.md b/docs/tutorial/examples.md
new file mode 100644
index 0000000..a9814ae
--- /dev/null
+++ b/docs/tutorial/examples.md
@@ -0,0 +1,202 @@
+# Examples
+
+OptiMHC ships with example configuration files under the `examples/` directory. This page walks through each one, explaining every section.
+
+## MHC Class I Example
+
+**File:** `examples/classI_example.yaml`
+
+This configuration demonstrates a full Class I immunopeptidomics rescoring workflow with all available features.
+
+```yaml
+experimentName: classI_example
+inputType: pepxml
+inputFile:
+ - ../data/YE_20180428_SK_HLA_A0202_3Ips_a50mio_R1_01.pep.xml
+decoyPrefix: DECOY_
+outputDir: ./examples/results
+visualization: True
+removePreNxtAA: False
+numProcesses: 32
+showProgress: True
+```
+
+**General settings:**
+
+- `experimentName` — a label used for output file naming.
+- `inputType` — the format of your search engine output (`pepxml` or `pin`).
+- `inputFile` — one or more paths to PepXML files.
+- `decoyPrefix` — the prefix used by the search engine to mark decoy protein accessions (default: `DECOY_`).
+- `outputDir` — where results, models, and figures are written.
+- `numProcesses` — number of parallel processes for feature generation.
+
+### Modification Mapping
+
+```yaml
+modificationMap:
+ "147.035385": "UNIMOD:35" # Oxidation (M)
+ "160.030649": "UNIMOD:4" # Carbamidomethyl (C)
+```
+
+Maps the **full modified residue mass** (residue + modification, as recorded in PepXML) to a UNIMOD identifier. This is required by generators that need standardized modification notation (e.g., SpectralSimilarity, DeepLC).
+
+### Allele Settings
+
+```yaml
+allele:
+ - HLA-A*02:02
+```
+
+Specifies the MHC allele(s) for binding prediction tools. For Class I, use standard HLA nomenclature (e.g., `HLA-A*02:01`).
+
+### Features
+
+```yaml
+featureGenerator:
+ - name: Basic
+ - name: SpectralSimilarity
+ params:
+ mzmlDir: ../data
+ spectrumIdPattern: (.+?)\.\d+\.\d+\.\d+
+ model: AlphaPeptDeep_ms2_generic
+ collisionEnergy: 28
+ instrument: LUMOS
+ tolerance: 20
+ numTopPeaks: 36
+ url: 127.0.0.1:8500
+ ssl: false
+ - name: DeepLC
+ params:
+ calibrationCriteria: expect
+ lowerIsBetter: True
+ calibrationSize: 0.1
+ - name: OverlappingPeptide
+ params:
+ minOverlapLength: 7
+ minLength: 7
+ maxLength: 20
+ overlappingScore: expect
+ - name: PWM
+ params:
+ class: I
+ - name: MHCflurry
+ - name: NetMHCpan
+```
+
+Each entry in `featureGenerator` specifies a feature `name` and optional `params`. Features without `params` use their defaults. See the [Features](features/index.md) section for detailed documentation of each feature.
+
+### Rescoring Settings
+
+```yaml
+rescore:
+ testFDR: 0.01
+ model: Percolator
+ numJobs: 4
+```
+
+- `testFDR` — the FDR threshold for the test set (default: 0.01).
+- `model` — the rescoring model: `Percolator` (linear SVM), `XGBoost`, or `RandomForest`.
+- `numJobs` — number of parallel jobs for cross-validation in XGBoost/RandomForest models.
+
+---
+
+## MHC Class II Example
+
+**File:** `examples/classII_example.yaml`
+
+This configuration mirrors the Class I example but is adapted for MHC Class II immunopeptidomics.
+
+```yaml
+experimentName: classII_example
+inputType: pepxml
+inputFile:
+ - ../data/AG20201214_FAIMS_DPB0101_DPA0201_93e6_1hr.pep.xml
+decoyPrefix: DECOY_
+outputDir: ./examples/results
+```
+
+### Key Differences from Class I
+
+**Allele notation** uses the Class II alpha-beta chain format:
+
+```yaml
+allele:
+ - HLA-DPA1*02:01-DPB1*01:01
+```
+
+**OverlappingPeptide** parameters are adjusted for the longer peptide lengths typical of Class II:
+
+```yaml
+- name: OverlappingPeptide
+ params:
+ minOverlapLength: 8
+ minLength: 9
+ maxLength: 50
+```
+
+**PWM** is set to Class II mode, which uses a sliding 9-mer core window with N- and C-flank scoring:
+
+```yaml
+- name: PWM
+ params:
+ class: II
+```
+
+**NetMHCIIpan** replaces MHCflurry and NetMHCpan (which are Class I only):
+
+```yaml
+- name: NetMHCIIpan
+```
+
+---
+
+## Experiment Mode Example
+
+**File:** `examples/experiment_example.yaml`
+
+Experiment mode runs multiple rescoring experiments with different feature subsets on the same input data, allowing you to compare the contribution of individual features.
+
+The general settings and features are defined once at the top. The `experiments` section then defines each experiment:
+
+```yaml
+experiments:
+ - name: "Baseline"
+ source: ["Original"]
+ model: "Percolator"
+ - name: "Complete"
+ source: ["Original", "OverlappingPeptide", "ContigFeatures", "PWM", "Basic"]
+ model: "Percolator"
+ - name: "Shuffle"
+ source: ["Original", "Basic", "OverlappingPeptide", "ContigFeatures", "PWM"]
+ model: "Percolator"
+```
+
+Each experiment specifies:
+
+- `name` — a label for the experiment, used for output subdirectories.
+- `source` — a list of feature sources to include. These correspond to the source names registered by each feature (e.g., `"Original"` from the parser, `"Basic"` from Basic, etc.).
+- `model` — the rescoring model to use for this experiment.
+
+Run experiment mode with:
+
+```bash
+optimhc experiment --config examples/experiment_example.yaml
+```
+
+!!! tip
+ Experiment mode is useful for ablation studies — start with `"Original"` as a baseline, then incrementally add feature sources to measure their impact on peptide identification.
+
+### Available Feature Sources
+
+| Source Name | Feature |
+|---|---|
+| `Original` | PepXML / PIN parser (search engine features) |
+| `Basic` | Basic peptide features |
+| `SpectralSimilarity` | Spectral similarity |
+| `DeepLC` | Retention time deviation |
+| `OverlappingPeptide` | Overlapping peptide score |
+| `ContigFeatures` | Contig-level features from overlapping peptide analysis |
+| `PWM` | Position weight matrix score |
+| `MHCflurry` | MHCflurry binding prediction |
+| `NetMHCpan` | NetMHCpan binding prediction |
+| `NetMHCIIpan` | NetMHCIIpan binding prediction |
diff --git a/docs/tutorial/features/antigen-presentation.md b/docs/tutorial/features/antigen-presentation.md
new file mode 100644
index 0000000..b87d255
--- /dev/null
+++ b/docs/tutorial/features/antigen-presentation.md
@@ -0,0 +1,125 @@
+# Antigen Processing and Presentation Scores
+
+OptiMHC integrates three external tools for predicting MHC binding affinity and antigen presentation. These tools provide complementary information about whether a peptide is likely to be naturally presented on the cell surface by MHC molecules.
+
+## NetMHCpan (MHC Class I)
+
+**Config name:** `NetMHCpan` | **Source name:** `NetMHCpan`
+
+[NetMHCpan 4.1](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) predicts peptide binding to MHC Class I molecules using artificial neural networks trained on binding affinity and eluted ligand data.
+
+### Output Columns
+
+| Column | Description |
+|---|---|
+| `netmhcpan_best_score` | Binding score for the best allele |
+| `netmhcpan_best_affinity` | Predicted binding affinity (nM) for the best allele |
+| `netmhcpan_best_percentile_rank` | Percentile rank for the best allele |
+
+### Computation
+
+1. **Preprocess** peptide sequences: strip flanking amino acids and remove modifications.
+2. **Filter** peptides to length 8–30 (the supported range for NetMHCpan).
+3. **Predict** binding for all peptide-allele combinations using `NetMHCpan 4.1`.
+4. **Best allele selection**: for each peptide, select the allele \( a^* \) with the minimum percentile rank \( r \):
+
+\[
+a^* = \arg\min_{a} \; r(\text{peptide}, a)
+\]
+
+Missing values (peptides outside the length range) are filled with the column median.
+
+### Configuration
+
+```yaml
+featureGenerator:
+ - name: NetMHCpan
+```
+
+!!! warning "External installation required"
+ NetMHCpan is a standalone executable that must be downloaded from [DTU Health Tech](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) and added to your `PATH`. See [Installation](../../getting-started/installation.md#netmhcpan-netmhciipan-setup) for details.
+
+---
+
+## NetMHCIIpan (MHC Class II)
+
+**Config name:** `NetMHCIIpan` | **Source name:** `NetMHCIIpan`
+
+[NetMHCIIpan 4.3](https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.3/) predicts peptide binding to MHC Class II molecules. It uses binding affinity (BA) mode.
+
+### Output Columns
+
+| Column | Description |
+|---|---|
+| `netmhciipan_best_score` | Binding score for the best allele |
+| `netmhciipan_best_affinity` | Predicted binding affinity (nM) for the best allele |
+| `netmhciipan_best_percentile_rank` | Percentile rank for the best allele |
+
+### Computation
+
+1. **Preprocess** peptide sequences: strip flanking amino acids and remove modifications.
+2. **Filter** peptides to length 9–50 (the supported range for NetMHCIIpan).
+3. **Predict** binding for all peptide-allele combinations using `NetMHCIIpan 4.3 BA`.
+4. **Best allele selection**: for each peptide, select the allele with the minimum percentile rank.
+
+Missing values are filled with the column median.
+
+### Configuration
+
+```yaml
+featureGenerator:
+ - name: NetMHCIIpan
+```
+
+!!! warning "External installation required"
+ NetMHCIIpan is a standalone executable that must be downloaded from [DTU Health Tech](https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.3/) and added to your `PATH`. See [Installation](../../getting-started/installation.md#netmhcpan-netmhciipan-setup) for details.
+
+---
+
+## MHCflurry (MHC Class I)
+
+**Config name:** `MHCflurry` | **Source name:** `MHCflurry`
+
+[MHCflurry](https://github.com/openvax/mhcflurry) provides Class I MHC binding affinity predictions along with antigen processing and presentation scores. Unlike NetMHCpan, MHCflurry also models the antigen processing pathway (proteasomal cleavage and TAP transport) in addition to MHC binding.
+
+### Output Columns
+
+| Column | Description |
+|---|---|
+| `mhcflurry_affinity` | Predicted binding affinity (nM) |
+| `mhcflurry_processing_score` | Antigen processing score (likelihood of proteasomal cleavage and TAP transport) |
+| `mhcflurry_presentation_score` | Combined presentation score integrating binding and processing |
+| `mhcflurry_presentation_percentile` | Percentile rank of the presentation score |
+
+### Computation
+
+1. **Preprocess** peptide sequences: strip flanking amino acids and remove modifications.
+2. **Filter** peptides to length 8–15 (the supported range for MHCflurry).
+3. **Predict** using `Class1PresentationPredictor`, which returns:
+ - **Affinity** — predicted IC50 binding affinity in nM.
+ - **Processing score** — a score reflecting the likelihood that the peptide undergoes proper antigen processing (proteasomal cleavage, TAP transport).
+ - **Presentation score** — a combined score that integrates binding affinity and processing likelihood.
+ - **Presentation percentile** — the percentile rank of the presentation score relative to a reference distribution.
+
+Missing values (peptides outside the 8–15 length range) are filled with the column median.
+
+### Configuration
+
+```yaml
+featureGenerator:
+ - name: MHCflurry
+```
+
+No additional parameters are required. The alleles are taken from the top-level `allele` configuration.
+
+---
+
+## Choosing Between Tools
+
+| Tool | MHC Class | Peptide Length | Scores | External Setup |
+|---|---|---|---|---|
+| NetMHCpan | I | 8–30 | Binding score, affinity, percentile rank | Manual download, add to `PATH` |
+| NetMHCIIpan | II | 9–50 | Binding score, affinity, percentile rank | Manual download, add to `PATH` |
+| MHCflurry | I | 8–15 | Affinity, processing, presentation, percentile | None (pip installed) |
+
+For **MHC Class I** analyses, you can use both NetMHCpan and MHCflurry simultaneously — they provide complementary predictions. For **MHC Class II**, NetMHCIIpan is the only option among these three tools.
diff --git a/docs/tutorial/features/basic.md b/docs/tutorial/features/basic.md
new file mode 100644
index 0000000..7e14a00
--- /dev/null
+++ b/docs/tutorial/features/basic.md
@@ -0,0 +1,71 @@
+# Basic Features
+
+The **Basic** feature (`name: Basic`) computes simple sequence-level properties of each peptide. These features capture compositional and length characteristics that can help distinguish true identifications from random matches.
+
+**Source name:** `Basic`
+
+## Output Columns
+
+| Column | Description |
+|---|---|
+| `length_diff_from_avg` | Peptide length minus the average peptide length across all PSMs |
+| `abs_length_diff_from_avg` | Absolute value of the length difference |
+| `unique_aa_count` | Number of distinct amino acid types in the peptide |
+| `unique_aa_proportion` | Proportion of distinct amino acid types relative to peptide length |
+| `shannon_entropy` | Shannon entropy of the amino acid frequency distribution |
+
+## Computation
+
+### Preprocessing
+
+Before computing features, each peptide sequence undergoes:
+
+1. **Flanking amino acid removal** — strips the preceding/next amino acid notation (e.g., `K.PEPTIDE.R` → `PEPTIDE`).
+2. **Modification removal** — removes bracketed mass values (e.g., `PEPTM[147.035]IDE` → `PEPTMIDE`).
+
+### Length Features
+
+Let \( L \) denote the length of a peptide and \( \bar{L} \) the average peptide length across the entire dataset of \( N \) PSMs:
+
+\[
+\bar{L} = \frac{1}{N} \sum_{i=1}^{N} L_i
+\]
+
+The length difference \( \delta \) and its absolute value are:
+
+\[
+\delta = L - \bar{L}, \quad |\delta| = |L - \bar{L}|
+\]
+
+### Unique Amino Acid Features
+
+Let \( U \) denote the number of distinct amino acid types in a peptide. The unique amino acid proportion \( U_r \) is:
+
+\[
+U_r = \frac{U}{L}
+\]
+
+### Shannon Entropy
+
+The Shannon entropy \( H \) quantifies the diversity of amino acid usage within a peptide:
+
+\[
+H = -\sum_{b \in \mathcal{A}} p_b \log_2(p_b)
+\]
+
+where \( \mathcal{A} \) is the set of unique amino acids in the peptide and \( p_b \) is the frequency of amino acid \( b \):
+
+\[
+p_b = \frac{n_b}{L}
+\]
+
+with \( n_b \) being the count of amino acid \( b \). A peptide composed of a single repeated amino acid has \( H = 0 \). A peptide with all unique amino acids has maximal entropy \( H = \log_2(L) \).
+
+## Configuration
+
+```yaml
+featureGenerator:
+ - name: Basic
+```
+
+No parameters are required. The Basic feature uses default preprocessing settings (`remove_pre_nxt_aa: true`, `remove_modification: true`).
diff --git a/docs/tutorial/features/index.md b/docs/tutorial/features/index.md
new file mode 100644
index 0000000..f943953
--- /dev/null
+++ b/docs/tutorial/features/index.md
@@ -0,0 +1,39 @@
+# Features Overview
+
+OptiMHC uses a modular feature generation system. Each feature computes a set of scoring columns that are added to the PSM data before rescoring. By combining features from multiple orthogonal sources, the rescoring model can better distinguish true peptide identifications from false ones.
+
+## Feature Architecture
+
+All features inherit from `BaseFeatureGenerator` and implement a common interface:
+
+- **`feature_columns`** — the list of output column names.
+- **`id_column`** — the key column(s) used to merge features back into the PsmContainer.
+- **`generate_features()`** — computes and returns a DataFrame of features.
+
+Features are configured in the YAML file as a list of `{name, params}` entries. The pipeline instantiates each feature, calls `generate_features()`, and merges the result into the PsmContainer.
+
+## Feature Categories
+
+| Category | Name | Columns | Description |
+|---|---|---|---|
+| [Original Features](original.md) | PepXML / PIN parser | Variable | Search engine scores and derived mass/ion features |
+| [Basic Features](basic.md) | `Basic` | 5 | Peptide sequence properties (length, entropy, AA composition) |
+| [Spectral Similarity](spectral-similarity.md) | `SpectralSimilarity` | 8 | Similarity between experimental and predicted spectra |
+| [Retention Time Deviation](rt-deviation.md) | `DeepLC` | 5 | Deviation between observed and predicted retention time |
+| [Antigen Presentation Scores](antigen-presentation.md) | `NetMHCpan`, `NetMHCIIpan`, `MHCflurry` | 3–4 per tool | MHC binding affinity and presentation predictions |
+| [PWM Score](pwm.md) | `PWM` | 1–3 per allele | Position weight matrix binding score |
+| [Overlapping Peptide Score](overlapping.md) | `OverlappingPeptide` | 4 | Graph-based peptide overlap and contig assembly features |
+
+## Preprocessing
+
+Most features share common preprocessing steps:
+
+- **Strip flanking amino acids** — remove the preceding and next amino acid notation (e.g., `K.PEPTIDE.R` becomes `PEPTIDE`).
+- **Remove modifications** — strip bracketed mass annotations (e.g., `PEPTM[147.035]IDE` becomes `PEPTMIDE`).
+
+??? note "Selenocysteine (U) handling"
+ Selenocysteine (`U`) is an extremely rare amino acid that most prediction tools do not support. During preprocessing, `U` is automatically replaced with cysteine (`C`) to ensure compatibility with external tools such as Koina, DeepLC, MHCflurry, NetMHCpan, and the PWM scoring matrices.
+
+## Missing Value Handling
+
+When a feature cannot be computed for a PSM (e.g., peptide length outside the supported range for a binding predictor), the missing values are filled with the **median** of the successfully computed values for that column. This ensures no NaN values propagate to the rescoring model.
diff --git a/docs/tutorial/features/original.md b/docs/tutorial/features/original.md
new file mode 100644
index 0000000..60383d1
--- /dev/null
+++ b/docs/tutorial/features/original.md
@@ -0,0 +1,110 @@
+# Original Features
+
+Original features are the initial set of scores and derived quantities extracted during input parsing. They form the baseline feature set (source name: `"Original"`) that every rescoring run includes. The specific features depend on the input format.
+
+## PepXML Features
+
+When parsing PepXML files, OptiMHC extracts raw search engine scores and computes several derived features.
+
+### Search Engine Scores
+
+All `<search_score>` elements from the PepXML file are extracted as features. The exact columns depend on the search engine that produced the file. For example, Comet produces:
+
+- `xcorr` — cross-correlation score
+- `deltacn` — delta correlation (difference to next-best match)
+- `deltacnstar` — delta CN star
+- `spscore` — preliminary score
+- `sprank` — SP rank
+- `expect` — expectation value (E-value)
+
+Other search engines (X!Tandem, MSFragger, etc.) produce their own score columns, all of which are automatically included.
+
+### Mass Difference Features
+
+Let \( m_\mathrm{exp} \) and \( m_\mathrm{calc} \) denote the experimental and calculated precursor neutral masses, respectively. The mass difference \( \Delta m \) is:
+
+\[
+\Delta m = m_\mathrm{exp} - m_\mathrm{calc}
+\]
+
+The experimental and calculated m/z values are derived from the charge state \( z \) using the proton mass \( m_\mathrm{H} = 1.00727646677 \) Da:
+
+\[
+(m/z)_\mathrm{exp} = \frac{m_\mathrm{exp}}{z} + m_\mathrm{H}, \quad
+(m/z)_\mathrm{calc} = \frac{m_\mathrm{calc}}{z} + m_\mathrm{H}
+\]
+
+The absolute m/z difference \( \Delta_{mz} \) is:
+
+\[
+\Delta_{mz} = \left|(m/z)_\mathrm{exp} - (m/z)_\mathrm{calc}\right|
+\]
+
+### Matched Ion Ratio
+
+When the search engine reports matched and total ions, the matched ion ratio \( R_\mathrm{ion} \) is computed:
+
+\[
+R_\mathrm{ion} = \frac{n_\mathrm{matched}}{n_\mathrm{total}}
+\]
+
+This feature is only included if both values are present and \( n_\mathrm{total} > 0 \) for all PSMs.
+
+### Charge One-Hot Encoding
+
+The precursor charge state is one-hot encoded into binary columns:
+
+- `charge_1`, `charge_2`, `charge_3`, ...
+
+For a PSM with charge 2, `charge_2 = 1` and all other charge columns are 0. This allows the rescoring model to learn charge-specific effects without treating charge as a continuous variable.
+
+### Log Transformation of P-values and E-values
+
+Many search engine scores span several orders of magnitude (e.g., E-values from \( 10^{-20} \) to \( 10^{2} \)). Following the approach used in mokapot, OptiMHC applies automatic log-transformation to compress these ranges into a scale more suitable for machine learning models.
+
+**Scientific notation values:** If values contain scientific notation and span 4 or more orders of magnitude, the log transform for a value \( x = a \times 10^{b} \) is:
+
+\[
+\log_{10}(x) = \log_{10}(a) + b
+\]
+
+For zero values, the log is set to one less than the minimum observed log value.
+
+**Numeric columns:** For non-negative, non-binary numeric columns where \( x_\mathrm{max} / x_\mathrm{min}^{+} \geq 10{,}000 \) (where \( x_\mathrm{min}^{+} \) is the smallest nonzero value):
+
+\[
+x' = \begin{cases}
+\log_{10}(x) & \text{if } x > 0 \\
+\log_{10}\!\big(x_\mathrm{min}^{+}\big) - 1 & \text{if } x = 0
+\end{cases}
+\]
+
+The `num_matched_peptides` column is always log-transformed as \( \log_{10}(x) \).
+
+### Decoy Detection
+
+A PSM is labeled as a **decoy** if the first protein accession starts with the `decoyPrefix` (default: `DECOY_`). If any alternative protein accession does *not* start with the prefix, the PSM is re-labeled as a **target**.
+
+### Additional Metadata
+
+The following columns are extracted but treated as metadata (not used as rescoring features):
+
+- `ms_data_file` — raw file identifier
+- `scan` — scan number
+- `spectrum` — spectrum ID string
+- `label` — target/decoy label
+- `calc_mass` — calculated neutral mass
+- `peptide` — peptide sequence (with modifications)
+- `proteins` — protein accessions
+- `charge` — precursor charge (kept as metadata; one-hot columns are the features)
+- `retention_time` — retention time in seconds
+
+---
+
+## PIN Features
+
+When parsing PIN (Percolator Input) files, the feature set is determined by the file itself. All columns that are not recognized as metadata (`Label`, `ScanNr`, `SpecId`, `Peptide`, `Proteins`, `rank`, `Charge`) are treated as **"Original"** rescoring features.
+
+Column name matching is case-insensitive. Charge columns matching the pattern `charge[_]?\d+` (e.g., `charge1`, `charge_2`) are detected automatically. For each PSM, the charge value is determined by which charge column contains a value of 1.
+
+The Label column uses the convention `1` for target and `-1` for decoy.
diff --git a/docs/tutorial/features/overlapping.md b/docs/tutorial/features/overlapping.md
new file mode 100644
index 0000000..b60fd88
--- /dev/null
+++ b/docs/tutorial/features/overlapping.md
@@ -0,0 +1,108 @@
+# Overlapping Peptide Score
+
+The **OverlappingPeptide** feature (`name: OverlappingPeptide`) is designed for detecting **ladder-like presentation hotspots** — genomic regions where the MHC presentation machinery repeatedly samples overlapping peptides, producing a characteristic nested/ladder pattern in the immunopeptidome.
+
+The method is inspired by the **Overlap-Layout-Consensus (OLC)** algorithm from genome assembly. We adapt the first two stages — overlap detection and layout (contig construction) — to build an overlap graph of peptide sequences and assemble them into contigs.
+
+The core idea: if multiple peptides within the same contig are confidently identified, then other peptides in the same group are also likely to be genuine MHC ligands.
+
+**Source name:** `OverlappingPeptide`
+
+## Output Columns
+
+| Column | Description |
+|---|---|
+| `contig_member_count` | Number of peptides in the contig that this peptide belongs to |
+| `contig_extension_ratio` | How much the contig extends beyond the peptide itself |
+| `contig_member_rank` | Rank of the contig by member count (largest contig = rank 1) |
+| `contig_length` | Length of the assembled contig sequence |
+
+## Computation
+
+### Step 1: Preprocessing and Filtering
+
+Peptide sequences are preprocessed (flanking AA removal, modification removal) and then filtered by length and entropy thresholds.
+
+### Step 2: Redundancy Removal
+
+Peptides that are exact substrings of longer peptides are identified. These **redundant** peptides are removed from the graph but are later mapped to the contig of their containing peptide. This prevents artificial inflation of contig membership.
+
+### Step 3: Overlap Graph Construction
+
+A directed graph \( G = (V, E) \) is built where:
+
+- **Nodes** \( V \) represent non-redundant peptides.
+- **Edge** \( A \to B \in E \) exists if a suffix of peptide \( A \) matches a prefix of peptide \( B \) with overlap length \( \geq \ell_\mathrm{min} \) (the `minOverlapLength` parameter).
+
+For efficient overlap detection, a prefix index is built: for each peptide, all prefixes of length \( \ell_\mathrm{min} \) to \( |seq| \) are stored. Then for each peptide, its suffixes are looked up in the index to find overlapping partners.
+
+When multiple overlaps exist between the same pair of peptides, only the **longest** overlap is kept. The edge weight stores the overlap length.
+
+### Step 4: Transitive Edge Removal
+
+To produce clean, non-branching paths, transitive edges are removed. If path \( A \to B \to C \) exists, the direct edge \( A \to C \) is removed (since the connection is already captured via \( B \)). This extends to longer transitive paths as well.
+
+### Step 5: Contig Assembly
+
+Contigs are identified as **non-branching paths** in the graph — chains of nodes where each internal node has exactly one predecessor and one successor.
+
+Each contig is assembled into a consensus sequence by concatenating the non-overlapping portions. For a contig of peptides \( P_1, P_2, \ldots, P_k \) with overlap lengths \( o_1, o_2, \ldots, o_{k-1} \):
+
+\[
+C = P_1 \,\|\, P_2[o_1\!:] \,\|\, P_3[o_2\!:] \,\|\, \cdots \,\|\, P_k[o_{k-1}\!:]
+\]
+
+where \( P_i[o:] \) denotes the suffix of \( P_i \) beyond the first \( o \) characters, and \( \| \) denotes concatenation.
+
+### Step 6: Peptide-to-Contig Mapping
+
+Each peptide in a contig is assigned the contig's index. Redundant peptides (removed in Step 2) inherit the contig assignment of the peptide that contains them as a substring.
+
+### Step 7: Feature Computation
+
+Let \( n_c \) denote the number of peptides in a contig (including redundant ones), \( L_c \) the length of the assembled contig sequence, and \( L \) the length of the individual peptide.
+
+The **contig member count** is simply \( n_c \).
+
+The **contig length** is \( L_c \).
+
+The **contig member rank** \( R_c \) is the rank of the contig by \( n_c \) in descending order (largest contig = rank 1, ties broken by minimum rank).
+
+The **contig extension ratio** \( E_c \) measures how much the contig extends beyond the peptide:
+
+\[
+E_c = \frac{L_c - L}{L}
+\]
+
+A value of 0 means the peptide spans the entire contig; larger values indicate the contig extends significantly beyond the peptide.
+
+### Missing Values
+
+Peptides that were filtered out (Step 1) receive imputed values. The `fill_missing` parameter controls the strategy:
+
+- `"median"` (default) — fill with the median of each feature column.
+- `"zero"` — fill with zeros.
+
+## Configuration
+
+```yaml
+featureGenerator:
+ - name: OverlappingPeptide
+ params:
+ minOverlapLength: 7 # Minimum suffix-prefix overlap to form an edge
+ minLength: 7 # Minimum peptide length to include
+ maxLength: 20 # Maximum peptide length to include
+ minEntropy: 0 # Minimum Shannon entropy to include
+ overlappingScore: expect # Search engine score column (for internal ranking)
+```
+
+| Parameter | Default | Description |
+|---|---|---|
+| `minOverlapLength` | `6` | Minimum overlap length for edge creation |
+| `minLength` | `7` | Minimum peptide length to include in the graph |
+| `maxLength` | `60` | Maximum peptide length to include |
+| `minEntropy` | `0` | Minimum Shannon entropy threshold |
+| `fill_missing` | `"median"` | Strategy for filling missing values: `"median"` or `"zero"` |
+
+!!! tip
+ For MHC Class I, typical settings are `minLength: 7`, `maxLength: 20`, `minOverlapLength: 7`. For MHC Class II, use wider ranges like `minLength: 9`, `maxLength: 50`, `minOverlapLength: 8` to accommodate longer peptides.
diff --git a/docs/tutorial/features/pwm.md b/docs/tutorial/features/pwm.md
new file mode 100644
index 0000000..985d09d
--- /dev/null
+++ b/docs/tutorial/features/pwm.md
@@ -0,0 +1,106 @@
+# PWM Score
+
+The **PWM** feature (`name: PWM`) scores peptides against Position Weight Matrices derived from known MHC ligand data. PWM scoring is a classical, fast, and interpretable method for estimating MHC binding potential.
+
+**Source name:** `PWM`
+
+The PWM matrices used in OptiMHC are derived from the [SysteMHC Atlas](https://systemhc.sjtu.edu.cn/), a public data repository of immunopeptidomics datasets generated by mass spectrometry:
+
+> Shao, W.; Pedrioli, P.G.A. *et al.* The SysteMHC Atlas project. *Nucleic Acids Res* (2018). [doi:10.1093/nar/gkx664](https://doi.org/10.1093/nar/gkx664)
+>
+> Huang, X.; Gan, Z. *et al.* The SysteMHC Atlas v2.0, an updated resource for mass spectrometry-based immunopeptidomics. *Nucleic Acids Res* (2024). [doi:10.1093/nar/gkad1068](https://doi.org/10.1093/nar/gkad1068)
+
+## Output Columns
+
+The number and names of output columns depend on the MHC class and alleles.
+
+### MHC Class I
+
+| Column | Description |
+|---|---|
+| `PWM_Score_{allele}` | Total PWM score for the peptide |
+| `Anchor_Score_{allele}` | PWM score at the most conserved (anchor) positions only |
+
+### MHC Class II
+
+| Column | Description |
+|---|---|
+| `PWM_Score_{allele}` | Best 9-mer core binding score |
+| `N_Flank_PWM_Score_{allele}` | Score for the N-terminal flanking residues |
+| `C_Flank_PWM_Score_{allele}` | Score for the C-terminal flanking residues |
+
+## Computation
+
+### MHC Class I Scoring
+
+For Class I, a length-specific PWM is loaded for each allele and peptide length. Each entry \( W_{a,j} \) represents the log-odds weight for amino acid \( a \) at position \( j \).
+
+**Total PWM score.** The PWM score \( S \) for a peptide of length \( L \) with amino acid \( a_j \) at position \( j \) is:
+
+\[
+S = \sum_{j=1}^{L} W_{a_j, j}
+\]
+
+**Anchor score.** The anchor positions are identified as the \( n \) most conserved positions in the PWM (default \( n = 2 \)). Conservation is measured by the positional Shannon entropy \( H_j \):
+
+\[
+H_j = -\sum_{a \in \mathcal{A}} p_{a,j} \log_2(p_{a,j})
+\]
+
+where the probabilities are derived from the PWM weights:
+
+\[
+p_{a,j} = \frac{2^{W_{a,j}}}{\sum_{a'} 2^{W_{a',j}}}
+\]
+
+The \( n \) positions with the **lowest** \( H_j \) (highest conservation) are selected as anchor positions. The anchor score \( S_\mathrm{anc} \) is:
+
+\[
+S_\mathrm{anc} = \sum_{j \in \mathcal{J}_\mathrm{anc}} W_{a_j, j}
+\]
+
+where \( \mathcal{J}_\mathrm{anc} \) is the set of anchor positions.
+
+### MHC Class II Scoring
+
+MHC Class II molecules bind peptides through a 9-mer binding core embedded within a longer peptide. The PWM is always 9 positions long.
+
+**Core binding score.** A sliding window scans all possible 9-mer frames, and the frame with the highest score is selected. Let \( s \) be the starting position:
+
+\[
+S = \max_{s} \sum_{j=1}^{9} W_{a_{s+j-1}, j}
+\]
+
+**Flanking scores.** Once the best core position is determined, the flanking residues are scored:
+
+- **N-terminal flank**: up to 3 residues immediately preceding the binding core. If fewer than 3 residues are available, the sequence is padded with `X` (unknown amino acid).
+- **C-terminal flank**: up to 3 residues immediately following the binding core, similarly padded.
+
+Each flank is scored against its own 3-position PWM. Let \( W^{(N)} \) and \( W^{(C)} \) denote the N-flank and C-flank matrices, and let \( a_j \) here denote the \( j \)-th residue of the corresponding (padded) flank. The flanking scores \( S_N \) and \( S_C \) are:
+
+\[
+S_N = \sum_{j=1}^{3} W^{(N)}_{a_j, j}, \quad
+S_C = \sum_{j=1}^{3} W^{(C)}_{a_j, j}
+\]
+
+### Preprocessing
+
+- Flanking amino acids are stripped and modifications are removed.
+- Peptides for which no matching PWM is available (e.g., unsupported length for Class I) receive median-imputed values.
+
+## Configuration
+
+```yaml
+featureGenerator:
+ - name: PWM
+ params:
+ class: I # "I" or "II"
+ anchors: 2 # Number of anchor positions (Class I only)
+```
+
+| Parameter | Default | Description |
+|---|---|---|
+| `class` | *(required)* | MHC class: `"I"` or `"II"` |
+| `anchors` | `2` | Number of anchor positions for anchor score computation (Class I only) |
+
+The alleles are taken from the top-level `allele` configuration.
diff --git a/docs/tutorial/features/rt-deviation.md b/docs/tutorial/features/rt-deviation.md
new file mode 100644
index 0000000..4559517
--- /dev/null
+++ b/docs/tutorial/features/rt-deviation.md
@@ -0,0 +1,81 @@
+# Retention Time Deviation
+
+The **DeepLC** feature (`name: DeepLC`) predicts peptide retention times using the [DeepLC](https://github.com/compomics/DeepLC) deep learning model and computes the deviation between predicted and observed retention times. Correct peptide identifications should show smaller deviations, while incorrect matches tend to have larger discrepancies.
+
+**Source name:** `DeepLC`
+
+## Output Columns
+
+| Column | Description |
+|---|---|
+| `observed_retention_time` | Retention time recorded in the input data (seconds) |
+| `predicted_retention_time` | Retention time predicted by DeepLC |
+| `retention_time_diff` | Signed difference: predicted minus observed |
+| `abs_retention_time_diff` | Absolute value of the retention time difference |
+| `retention_time_ratio` | Ratio of the smaller to the larger of predicted and observed |
+
+## Computation
+
+### Step 1: Preprocessing
+
+Peptide sequences are preprocessed for DeepLC input:
+
+1. **Flanking amino acids** are stripped (e.g., `K.PEPTIDE.R` → `PEPTIDE`).
+2. **Modifications** are converted from mass-annotated format to UNIMOD notation using the `modificationMap`. For example, `M[147.035]` is converted to a UNIMOD-indexed modification string that DeepLC understands.
+
+### Step 2: Calibration
+
+DeepLC benefits from calibration on high-confidence PSMs from the same LC-MS run. The calibration procedure:
+
+1. **Sort** all PSMs by the `calibrationCriteria` column (a search engine score). If `lowerIsBetter` is true, sort ascending; otherwise, sort descending.
+2. **Select** the top PSMs as the calibration set:
+ - If `calibrationSize` is a float (e.g., 0.1), take the top 10% of PSMs.
+ - If `calibrationSize` is an integer (e.g., 500), take the top 500 PSMs.
+3. **Filter** to target PSMs only (decoys are excluded from calibration).
+4. **Calibrate** the DeepLC predictor using the observed retention times of the calibration set.
+
+### Step 3: Prediction
+
+The calibrated DeepLC model predicts retention times for all PSMs in the dataset.
+
+### Step 4: Feature Computation
+
+Let \( t_\mathrm{obs} \) and \( t_\mathrm{pred} \) denote the observed and predicted retention times, respectively.
+
+The signed difference \( \Delta t \), its absolute value \( |\Delta t| \), and the retention time ratio \( R_t \) are:
+
+\[
+\Delta t = t_\mathrm{pred} - t_\mathrm{obs}
+\]
+
+\[
+|\Delta t| = |t_\mathrm{pred} - t_\mathrm{obs}|
+\]
+
+\[
+R_t = \frac{\min(t_\mathrm{pred},\; t_\mathrm{obs})}{\max(t_\mathrm{pred},\; t_\mathrm{obs})}
+\]
+
+\( R_t \) is bounded in \((0, 1]\) and equals 1 when the predicted and observed times are identical.
+
+### Missing Values
+
+Any PSMs for which DeepLC cannot produce a prediction are filled with the **median** of each feature column.
+
+## Configuration
+
+```yaml
+featureGenerator:
+ - name: DeepLC
+ params:
+ calibrationCriteria: expect # Column name used for selecting calibration PSMs
+ lowerIsBetter: true # Whether lower values of the criteria are better
+ calibrationSize: 0.1 # Fraction (float) or count (int) of top PSMs for calibration
+```
+
+| Parameter | Default | Description |
+|---|---|---|
+| `calibrationCriteria` | *(required)* | Name of a search engine score column to rank PSMs for calibration |
+| `lowerIsBetter` | `false` | Set to `true` if lower values of the calibration criteria indicate better PSMs (e.g., E-values) |
+| `calibrationSize` | `0.15` | Fraction of PSMs (float) or absolute count (int) for the calibration set |
+
diff --git a/docs/tutorial/features/spectral-similarity.md b/docs/tutorial/features/spectral-similarity.md
new file mode 100644
index 0000000..d216eee
--- /dev/null
+++ b/docs/tutorial/features/spectral-similarity.md
@@ -0,0 +1,152 @@
+# Spectral Similarity
+
+The **SpectralSimilarity** feature (`name: SpectralSimilarity`) compares experimentally observed MS2 spectra against theoretical spectra predicted by a deep learning model served via [Koina](https://koina.proteomicsdb.org/). Koina is an open-source, web-accessible model repository that democratizes access to machine learning models for proteomics research, enabling remote execution of prediction models via standard HTTP/S requests without requiring specialized hardware ([Lautenbacher *et al.*, Nature Communications, 2025](https://doi.org/10.1038/s41467-025-64870-5)). The similarity between observed and predicted fragmentation patterns is a powerful indicator of correct peptide-spectrum matches.
+
+**Source name:** `SpectralSimilarity`
+
+## Output Columns
+
+| Column | Description |
+|---|---|
+| `spectral_angle_similarity` | Normalized spectral angle between experimental and predicted spectra |
+| `cosine_similarity` | Cosine similarity between intensity vectors |
+| `pearson_correlation` | Pearson correlation coefficient |
+| `spearman_correlation` | Spearman rank correlation coefficient |
+| `mean_squared_error` | Mean squared error on L2-normalized intensity vectors |
+| `unweighted_entropy_similarity` | Entropy-based similarity measure |
+| `predicted_seen_nonzero` | Count of predicted peaks that were observed in the experimental spectrum |
+| `predicted_not_seen` | Count of predicted peaks that were not observed |
+
+## Computation
+
+### Step 1: Spectrum Extraction
+
+Experimental MS2 spectra are extracted from mzML files. Each spectrum is associated with a PSM through the spectrum ID pattern. The m/z and intensity arrays are sorted by m/z.
+
+### Step 2: Theoretical Spectrum Prediction
+
+Peptide sequences (with modifications mapped to UNIMOD notation via `modificationMap`) and charge states are sent to a Koina server, which returns predicted fragment ion m/z values and intensities using the specified deep learning model (e.g., `AlphaPeptDeep_ms2_generic`).
+
+### Step 3: Peak Alignment
+
+For each predicted peak at m/z value \( m \), a tolerance window is computed in parts-per-million (ppm):
+
+\[
+m_\mathrm{low} = m \cdot \left(1 - \frac{\tau}{10^6}\right), \quad
+m_\mathrm{high} = m \cdot \left(1 + \frac{\tau}{10^6}\right)
+\]
+
+where \( \tau \) is the ppm tolerance (default 20). Within this window, the experimental peak with the **highest intensity** is selected as the match. If no experimental peak falls within the window, the aligned experimental intensity is set to 0.
+
+This produces an aligned experimental intensity vector \( \mathbf{e} \) of the same length as the predicted intensity vector \( \mathbf{p} \).
+
+### Step 4: Top-N Peak Selection
+
+Only the top \( N \) peaks by predicted intensity are retained for similarity computation (default \( N = 36 \)). This focuses the comparison on the most informative fragment ions.
+
+### Step 5: Similarity Metrics
+
+All metrics are computed on the aligned, top-N intensity vectors \( \mathbf{e} \) (experimental) and \( \mathbf{p} \) (predicted).
+
+#### Spectral Angle Similarity
+
+The spectral angle similarity \( S_\mathrm{SA} \) is derived from the cosine of the angle \( \theta \) between the two vectors:
+
+\[
+\cos\theta = \frac{\mathbf{e} \cdot \mathbf{p}}{\|\mathbf{e}\| \, \|\mathbf{p}\|}
+\]
+
+\[
+S_\mathrm{SA} = 1 - \frac{2 \arccos(\cos\theta)}{\pi}
+\]
+
+\( S_\mathrm{SA} \) ranges from 0 (orthogonal spectra) to 1 (identical spectra). The \( 2/\pi \) normalization maps the angle from \([0, \pi/2]\) to \([0, 1]\).
+
+#### Cosine Similarity
+
+The cosine similarity \( S_{\mathrm{cos}} \) is:
+
+\[
+S_{\mathrm{cos}} = \frac{\mathbf{e} \cdot \mathbf{p}}{\|\mathbf{e}\| \, \|\mathbf{p}\|}
+\]
+
+#### Pearson Correlation
+
+The Pearson correlation coefficient \( r \) is:
+
+\[
+r = \frac{\sum_{i} (e_i - \bar{e})(p_i - \bar{p})}{\sqrt{\sum_{i} (e_i - \bar{e})^2} \sqrt{\sum_{i} (p_i - \bar{p})^2}}
+\]
+
+where \( \bar{e} \) and \( \bar{p} \) are the means of the experimental and predicted vectors.
+
+#### Spearman Correlation
+
+The Spearman correlation \( \rho \) is the Pearson correlation computed on the **ranks** of the values (using average ranking for ties) rather than the values themselves.
+
+#### Mean Squared Error
+
+The MSE is computed on L2-normalized vectors. Let \( \hat{e}_i = e_i / \|\mathbf{e}\| \) and \( \hat{p}_i = p_i / \|\mathbf{p}\| \). Then:
+
+\[
+\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} (\hat{e}_i - \hat{p}_i)^2
+\]
+
+#### Unweighted Entropy Similarity
+
+The unweighted entropy similarity is based on the concept of spectral entropy introduced by Li *et al.* ([Nature Methods, 2021](https://doi.org/10.1038/s41592-021-01331-z)).
+
+First, normalize intensities to probability distributions \( \mathbf{q}^{(e)} \) and \( \mathbf{q}^{(p)} \):
+
+\[
+q_i^{(e)} = \frac{e_i}{\sum_j e_j}, \quad q_i^{(p)} = \frac{p_i}{\sum_j p_j}
+\]
+
+Compute the Shannon entropy of each distribution and their mixture \( \mathbf{m} \):
+
+\[
+S_e = -\sum_i q_i^{(e)} \ln q_i^{(e)}, \quad
+S_p = -\sum_i q_i^{(p)} \ln q_i^{(p)}
+\]
+
+\[
+m_i = \frac{1}{2}\left(q_i^{(e)} + q_i^{(p)}\right), \quad
+S_m = -\sum_i m_i \ln m_i
+\]
+
+The entropy similarity \( S_\mathrm{ent} \) is:
+
+\[
+S_\mathrm{ent} = 1 - \frac{2 S_m - S_e - S_p}{\ln 4}
+\]
+
+The numerator equals twice the Jensen-Shannon divergence between the two distributions (in nats). Since the Jensen-Shannon divergence is bounded above by \( \ln 2 \), dividing by \( \ln 4 = 2\ln 2 \) normalizes the result to \([0, 1]\).
+
+#### Peak Matching Counts
+
+- **`predicted_seen_nonzero`** — the number of predicted peaks (with intensity > 0) that have a matched experimental peak with intensity > 0.
+- **`predicted_not_seen`** — the number of predicted peaks (with intensity > 0) that have no matching experimental peak (aligned intensity = 0).
+
+## Configuration
+
+```yaml
+featureGenerator:
+ - name: SpectralSimilarity
+ params:
+ mzmlDir: ../data # Directory containing mzML files
+ spectrumIdPattern: (.+?)\.\d+\.\d+\.\d+ # Regex to link PSMs to mzML files
+ model: AlphaPeptDeep_ms2_generic # Koina prediction model
+ collisionEnergy: 28 # Collision energy for prediction
+ instrument: LUMOS # Instrument type for prediction
+ tolerance: 20 # Peak matching tolerance in ppm
+ numTopPeaks: 36 # Number of top peaks to compare
+ url: koina.wilhelmlab.org:443 # Koina server gRPC endpoint
+ ssl: true # Use SSL for gRPC connection
+```
+
+!!! note
+ For a local Koina/Triton server, set `url: 127.0.0.1:8500` and `ssl: false`. The default public endpoint is `koina.wilhelmlab.org:443` with SSL enabled.
+
+## Requirements
+
+This feature requires access to a **Koina server** — either the [public endpoint](https://koina.proteomicsdb.org/) or a self-hosted [Triton Inference Server](https://github.com/wilhelm-lab/koina).
diff --git a/docs/tutorial/index.md b/docs/tutorial/index.md
new file mode 100644
index 0000000..ceb4cf5
--- /dev/null
+++ b/docs/tutorial/index.md
@@ -0,0 +1,13 @@
+# Tutorial
+
+This section provides a hands-on guide to using OptiMHC effectively.
+
+## What You Will Learn
+
+- **[Examples](examples.md)** — Walk through complete configuration files for MHC Class I, Class II, and experiment-mode analyses.
+- **[Pipeline Workflow](workflow.md)** — Understand each stage of the pipeline from input parsing through rescoring to visualization.
+- **[Features](features/index.md)** — Deep dive into every feature: what it computes, the underlying formulas, and how to configure it.
+
+## Prerequisites
+
+Before starting the tutorials, make sure you have [installed OptiMHC](../getting-started/installation.md) and verified it with `optimhc --help`. If you plan to use advanced features (SpectralSimilarity, DeepLC, MHCflurry, NetMHCpan), ensure their external dependencies are also installed.
diff --git a/docs/tutorial/workflow.md b/docs/tutorial/workflow.md
new file mode 100644
index 0000000..780c1bf
--- /dev/null
+++ b/docs/tutorial/workflow.md
@@ -0,0 +1,149 @@
+# Pipeline Workflow
+
+This page explains the complete OptiMHC pipeline from input to output. Understanding the workflow helps you choose the right configuration for your data and troubleshoot issues.
+
+## Overview
+
+The pipeline executes in five sequential stages:
+
+```
+1. Configuration loading
+2. Input parsing
+3. Feature generation
+4. Rescoring
+5. Output & visualization
+```
+
+## Stage 1: Configuration
+
+OptiMHC uses a layered configuration system. Settings are resolved in the following precedence order (highest to lowest):
+
+1. **CLI flags** — command-line arguments override everything.
+2. **YAML file** — values from the `--config` file.
+3. **Default config** — built-in defaults for all settings.
+
+The default configuration provides sensible starting values:
+
+```yaml
+outputDir: ./results
+inputType: pepxml
+decoyPrefix: DECOY_
+visualization: true
+saveModels: true
+toFlashLFQ: true
+numProcesses: 4
+logLevel: INFO
+rescore:
+ testFDR: 0.01
+ trainFDR: 0.01
+ model: Percolator
+ numJobs: 1
+```
+
+The `Config` class deep-merges your YAML file with these defaults, so you only need to specify what differs.
+
+## Stage 2: Input Parsing
+
+OptiMHC accepts two input formats:
+
+### PepXML
+
+The PepXML parser extracts PSMs from the XML structure produced by search engines (e.g., Comet, X!Tandem). For each PSM it extracts:
+
+- Spectrum metadata (scan number, spectrum ID, charge, retention time)
+- Mass values (experimental and calculated neutral mass)
+- Peptide sequence with modifications
+- Protein accessions
+- All search engine scores
+
+The parser then computes derived features: mass differences, m/z differences, matched ion ratios, log-transformed p-values, and charge one-hot encoding. These become the **"Original"** feature set. See [Original Features](features/original.md) for details.
+
+### PIN (Percolator Input)
+
+The PIN parser reads tab-separated Percolator input files. All columns that are not recognized as metadata (Label, ScanNr, SpecId, Peptide, Proteins, rank, and charge columns) are treated as the **"Original"** feature set.
+
+### PsmContainer
+
+Regardless of input format, parsing produces a `PsmContainer` — the central data structure of the pipeline. It wraps a pandas DataFrame of PSMs and maintains a registry of feature groups:
+
+```python
+psms.rescoring_features = {
+ "Original": ["xcorr", "deltacn", "mass_diff", ...],
+}
+```
+
+Every subsequent feature adds its own entry to this registry. This design allows experiment mode to select specific feature subsets by source name.
+
+## Stage 3: Feature Generation
+
+The pipeline iterates over the `featureGenerator` list from the configuration. For each entry, it:
+
+1. Instantiates the feature class by name.
+2. Calls `generate_features()`, which returns a DataFrame.
+3. Merges the result into the PsmContainer via `add_features()` (join by key columns) or `add_features_by_index()` (join by DataFrame index).
+4. Registers the new columns under the feature's source name in `rescoring_features`.
+
+After all generators have run, the PsmContainer holds the complete feature matrix.
+
+Available generators are documented in detail in the [Features](features/index.md) section:
+
+| Feature | Source Name | Join Key |
+|---|---|---|
+| Basic | `Basic` | index |
+| SpectralSimilarity | `SpectralSimilarity` | spectrum + peptide + charge |
+| DeepLC | `DeepLC` | index |
+| OverlappingPeptide | `OverlappingPeptide` | peptide |
+| PWM | `PWM` | peptide |
+| MHCflurry | `MHCflurry` | peptide |
+| NetMHCpan | `NetMHCpan` | peptide |
+| NetMHCIIpan | `NetMHCIIpan` | peptide |
+
+## Stage 4: Rescoring
+
+Rescoring uses the [mokapot](https://mokapot.readthedocs.io/) framework. The pipeline:
+
+1. **Builds a mokapot dataset** — converts the PsmContainer into a `LinearPsmDataset` with the selected rescoring features, target/decoy labels, spectrum IDs, and peptide sequences.
+2. **Trains a model** — `mokapot.brew()` performs semi-supervised learning with 3-fold cross-validation:
+ - Trains the model on the training fold.
+ - Scores PSMs in the test fold.
+ - Repeats for all folds.
+3. **Assigns q-values** using target-decoy competition at the specified `testFDR`.
+
+### Available Models
+
+| Model | Description |
+|---|---|
+| **Percolator** | Linear SVM (default). Fast and robust. Uses `mokapot.PercolatorModel`. |
+| **XGBoost** | Gradient-boosted trees. Hyperparameters tuned via `GridSearchCV` (3-fold CV, ROC-AUC). Searches over `scale_pos_weight`, `max_depth`, `min_child_weight`, `gamma`. |
+| **RandomForest** | Random forest classifier. Hyperparameters tuned via `GridSearchCV` (3-fold CV, ROC-AUC). Searches over `class_weight`, `max_depth`, `min_samples_split`, `min_impurity_decrease`. |
+
+## Stage 5: Output & Visualization
+
+### Output Files
+
+- **mokapot result files** — PSM-level and peptide-level results with scores and q-values.
+- **PIN file** — the complete feature matrix in Percolator input format (useful for downstream tools).
+- **Models** — serialized rescoring models (when `saveModels: true`).
+- **FlashLFQ file** — quantification-ready output (when `toFlashLFQ: true`).
+
+### Visualizations
+
+When `visualization: true`, the pipeline produces:
+
+| Plot | Description |
+|---|---|
+| **qvalues.png** | Number of PSMs and peptides accepted at each q-value threshold. |
+| **feature_importance.png** | Bar chart showing the weight or importance of each feature in the trained model. |
+| **feature_correlation.png** | Heatmap of pairwise Pearson correlations among all rescoring features. |
+| **target_decoy_histogram.png** | KDE histograms comparing the distribution of each feature for targets vs. decoys (top-ranked hits only). |
+
+## Experiment Mode
+
+Experiment mode (`optimhc experiment --config ...`) shares stages 1–3 with the standard pipeline but then runs multiple rescoring experiments in parallel. Each experiment uses a different subset of feature sources and/or a different model.
+
+This is useful for:
+
+- **Ablation studies** — measuring the contribution of each feature group.
+- **Model comparison** — comparing Percolator vs. XGBoost vs. RandomForest on the same data.
+
+Each experiment runs in its own process and writes results to a separate subdirectory. A shared PIN file and feature correlation plot are generated once for the full feature set.
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..d72d5d0
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,93 @@
+site_name: OptiMHC
+site_description: A high-performance rescoring pipeline for immunopeptidomics data to significantly enhance peptide identification performance
+site_author: Zixiang Shang
+repo_url: https://github.com/5h4ng/OptiMHC
+repo_name: 5h4ng/OptiMHC
+
+theme:
+ name: material
+ palette:
+ - scheme: default
+ primary: indigo
+ accent: indigo
+ toggle:
+ icon: material/brightness-7
+ name: Switch to dark mode
+ - scheme: slate
+ primary: indigo
+ accent: indigo
+ toggle:
+ icon: material/brightness-4
+ name: Switch to light mode
+ features:
+ - navigation.instant
+ - navigation.tracking
+ - navigation.tabs
+ - navigation.sections
+ - navigation.indexes
+ - navigation.top
+ - search.suggest
+ - search.highlight
+ - content.code.copy
+
+plugins:
+ - search
+ - mkdocstrings:
+ handlers:
+ python:
+ options:
+ show_source: true
+ show_root_heading: true
+ show_root_full_path: false
+ members_order: source
+ docstring_style: numpy
+ merge_init_into_class: true
+ show_if_no_docstring: false
+
+markdown_extensions:
+ - admonition
+ - pymdownx.details
+ - pymdownx.superfences
+ - pymdownx.highlight:
+ anchor_linenums: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.tabbed:
+ alternate_style: true
+ - toc:
+ permalink: true
+ - pymdownx.arithmatex:
+ generic: true
+
+extra_javascript:
+ - javascripts/mathjax.js
+ - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js
+
+nav:
+ - Home: index.md
+ - Getting Started:
+ - Installation: getting-started/installation.md
+ - Quick Start: getting-started/quickstart.md
+ - Tutorial:
+ - Overview: tutorial/index.md
+ - Examples: tutorial/examples.md
+ - Pipeline Workflow: tutorial/workflow.md
+ - Features:
+ - Overview: tutorial/features/index.md
+ - Original Features: tutorial/features/original.md
+ - Basic Features: tutorial/features/basic.md
+ - Spectral Similarity: tutorial/features/spectral-similarity.md
+ - Retention Time Deviation: tutorial/features/rt-deviation.md
+ - Antigen Presentation Scores: tutorial/features/antigen-presentation.md
+ - PWM Score: tutorial/features/pwm.md
+ - Overlapping Peptide Score: tutorial/features/overlapping.md
+ - API Reference:
+ - Overview: api/index.md
+ - Core: api/core.md
+ - I/O: api/io.md
+ - Feature Generators: api/features.md
+ - Rescoring: api/rescore.md
+ - Visualization: api/visualization.md
+ - CLI: api/cli.md
+ - Development:
+ - development/index.md