diff --git a/README.md b/README.md index bc4cb04..a79b20e 100644 --- a/README.md +++ b/README.md @@ -71,9 +71,10 @@ Rescore parameters control how the rescoring step is executed and include: | Parameter | Type | Example | Description | | --------- | ------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `testFDR` | Float | `0.01` | The false-discovery rate threshold at which to evaluate the learned models. | -| `model` | String | `Percolator` | Model to use for rescoring (valid options include `Percolator`, `XGBoost`, or `RandomForest`). | -| `numJobs` | Integer | `4` | The number of parallel jobs to run. This value is passed to Scikit-learn's n_jobs parameter to control parallelism for model training or scoring. Set to -1 to use all available CPU cores. | +| `testFDR` | Float | `0.01` | The false-discovery rate threshold at which to evaluate the learned models and report final results. | +| `trainFDR` | Float | `0.01` | The FDR threshold used during model training to select positive PSMs in each iteration. Increase this value (e.g. `0.05`) if training fails with "No PSMs found below the eval_fdr" on challenging datasets. | +| `model` | String | `Percolator` | Model to use for rescoring (valid options include `Percolator`, `XGBoost`, or `RandomForest`). | +| `numJobs` | Integer | `4` | The number of parallel jobs to run. This value is passed to Scikit-learn's n_jobs parameter to control parallelism for model training or scoring. Set to -1 to use all available CPU cores. | #### Example YAML Configuration diff --git a/optimhc/core/config.py b/optimhc/core/config.py index eeb3071..e3dfa4b 100644 --- a/optimhc/core/config.py +++ b/optimhc/core/config.py @@ -20,7 +20,7 @@ "removePreNxtAA": False, "showProgress": True, "logLevel": "INFO", - "rescore": {"testFDR": 0.01, "model": "Percolator", "numJobs": 1}, + "rescore": {"testFDR": 0.01, "trainFDR": 0.01, "model": "Percolator", "numJobs": 1}, } diff --git a/optimhc/core/pipeline.py b/optimhc/core/pipeline.py index 054a410..6574ec4 100644 --- a/optimhc/core/pipeline.py +++ b/optimhc/core/pipeline.py @@ -73,6 +73,7 @@ def __init__(self, config): self.save_models = self.config.get("saveModels", True) self.to_flashlfq = self.config.get("toFlashLFQ", True) self.test_fdr = self.config.get("rescore", {}).get("testFDR", 0.01) + self.train_fdr = self.config.get("rescore", {}).get("trainFDR", 0.01) self.model_type = self.config.get("rescore", {}).get("model", "Percolator") self.n_jobs = self.config.get("rescore", {}).get("numJobs", 1) @@ -143,7 +144,6 @@ def rescore(self, psms, model_type=None, n_jobs=None, test_fdr=None, rescoring_f Number of parallel jobs. test_fdr : float, optional FDR threshold. - rescoring_features : list, optional List of features to use for rescoring. Returns @@ -161,14 +161,15 @@ def rescore(self, psms, model_type=None, n_jobs=None, test_fdr=None, rescoring_f model_type = model_type if model_type is not None else self.model_type n_jobs = n_jobs if n_jobs is not None else self.n_jobs + train_fdr = getattr(self, "train_fdr", 0.01) if model_type == "XGBoost": - model = XGBoostPercolatorModel(n_jobs=n_jobs) + model = XGBoostPercolatorModel(train_fdr=train_fdr, n_jobs=n_jobs) elif model_type == "RandomForest": - model = RandomForestPercolatorModel(n_jobs=n_jobs) + model = RandomForestPercolatorModel(train_fdr=train_fdr, n_jobs=n_jobs) elif model_type == "Percolator": - model = PercolatorModel(n_jobs=n_jobs) + model = PercolatorModel(train_fdr=train_fdr, n_jobs=n_jobs) else: - model = PercolatorModel(n_jobs=n_jobs) + model = PercolatorModel(train_fdr=train_fdr, n_jobs=n_jobs) kwargs = {} if rescoring_features is not None: diff --git a/optimhc/rescore/mokapot.py b/optimhc/rescore/mokapot.py index 341662e..53cfc06 100644 --- a/optimhc/rescore/mokapot.py +++ b/optimhc/rescore/mokapot.py @@ -58,7 +58,8 @@ def rescore( """ psms = convert_to_mokapot_dataset(psms, rescoring_features=rescoring_features) logger.info("Rescoring PSMs with mokapot.") - results, models = mokapot.brew(psms, model=model, test_fdr=test_fdr, **kwargs) + model_arg = [model] if model is not None else None + results, models = mokapot.brew(psms, model=model_arg, test_fdr=test_fdr, **kwargs) return results, models