diff --git a/modules.nf b/modules.nf
index 1f3dcb7..3cafef4 100644
--- a/modules.nf
+++ b/modules.nf
@@ -12,7 +12,9 @@ process PROCESS_BINDINGDB {
 
     script:
     """
-    python "${params.scripts}/process_bindingdb.py" --input-dir "${params.bindingDB}"
+    plumbline process-bindingdb \
+        --input-directory "${params.bindingDB}" \
+        --output-directory "./"
     """
 }
 process DOWNLOAD_PDB {
@@ -30,7 +32,8 @@ process DOWNLOAD_PDB {
 
     script:
     """
-    python "${params.scripts}/download_pdb.py" --input-json "${input_json}"
+    plumbline download-pdb \
+        --input-file "${input_json}"
     """
 }
 process PREP_CIF {
@@ -51,7 +54,11 @@ process PREP_CIF {
 
     script:
    """
-    python "${params.scripts}/prep_cif.py" --input-json "${input_json}" --input-cif "${input_cif}" --fasta-sequence "${params.fasta}"
+    plumbline prep-cif \
+        --input-json "${input_json}" \
+        --input-cif "${input_cif}" \
+        --fasta-sequence "${params.fasta}" \
+        --output-directory "./"
     """
 }
 process PREP_FOR_DOCKING {
@@ -71,7 +78,10 @@ process PREP_FOR_DOCKING {
 
     script:
     """
-    asap-cli protein-prep --target SARS-CoV-2-Mpro --pdb-file "${prepped_pdb}" --output-dir "./"
+    plumbline prep-protein-for-docking \
+        --target SARS-CoV-2-Mpro \
+        --pdb-file "${prepped_pdb}" \
+        --output-dir "./"
     """
 }
 process ASSESS_PREPPED_PROTEIN {
@@ -88,7 +98,9 @@ process ASSESS_PREPPED_PROTEIN {
 
     script:
     """
-    python "${params.scripts}/assess_prepped_protein.py" --input-oedu "${design_unit}"
+    plumbline assess-prepped-protein \
+        --input-file "${design_unit}" \
+        --output-directory "./"
     """
 }
 process GENERATE_CONSTRAINED_LIGAND_POSES {
@@ -105,7 +117,10 @@ process GENERATE_CONSTRAINED_LIGAND_POSES {
 
     script:
     """
-    python "${params.scripts}/generate_constrained_ligand_poses.py" --input-sdf "${params.congenericSeries}" --prepped-schema "${prepped_complex_json_schema}"
+    plumbline generate-constrained-ligand-poses \
+        --input-sdf "${params.congenericSeries}" \
+        --prepped-schema "${prepped_complex_json_schema}" \
+        --output-directory "./"
     """
 }
 
@@ -127,10 +142,10 @@ process MAKE_FEC_INPUTS {
     asap-cli alchemy create fecs-workflow.json
 
     asap-cli alchemy plan \
-      -f fecs-workflow.json \
-      --name ${uuid}_plumb_alchemiscale_network \
-      --receptor "${prepped_complex}" \
-      --ligands "${posed_ligands}" \
+        -f fecs-workflow.json \
+        --name ${uuid}_plumb_alchemiscale_network \
+        --receptor "${prepped_complex}" \
+        --ligands "${posed_ligands}"
     """
 }
 process VISUALIZE_NETWORK {
@@ -147,6 +162,8 @@ process VISUALIZE_NETWORK {
 
     script:
     """
-    python "${params.scripts}/visualize_network.py" --network-graphml "${network_graph}"
+    plumbline visualize-network \
+        --network-graphml "${network_graph}" \
+        --output-directory "./"
     """
-}
\ No newline at end of file
+}
diff --git a/pkg/README.md b/pkg/README.md
new file mode 100644
index 0000000..107f408
--- /dev/null
+++ b/pkg/README.md
@@ -0,0 +1,18 @@
+# plumbdb
+
+## Installation
+
+Since `plumbdb` is not yet registered with conda-forge, follow the developer installation instructions at [docs/developer.md](docs/developer.md#installation).
+
+## Usage
+
+### Command line
+
+`plumbdb` is usable from the command line through the `plumbline` CLI application.
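+
+For example (a representative session; `record.json` is a placeholder for a JSON file containing at least a `pdb_id` key):
+
+```bash
+plumbline --help
+plumbline download-pdb --input-file record.json --output-directory pdb/
+```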
diff --git a/pkg/docs/developer.md b/pkg/docs/developer.md
new file mode 100644
index 0000000..29326e9
--- /dev/null
+++ b/pkg/docs/developer.md
@@ -0,0 +1,19 @@
+# Developer documentation
+
+## Installation
+
+`plumbdb` can currently only be installed from its Python source, with its dependencies handled by `conda`.
+
+```bash
+conda env create -n plumbdb_dev -f env.yaml
+conda activate plumbdb_dev
+pip install -e ".[dev]"
+```
+
+## Running tests
+
+Tests are performed with [`pytest`](https://docs.pytest.org/), which is automatically installed as part of the `test` dependency group specified in `pyproject.toml`. With the test environment active and from within `plumb/pkg/`, run:
+
+```bash
+python -m pytest src/
+```
\ No newline at end of file
diff --git a/pkg/env.yaml b/pkg/env.yaml
new file mode 100644
index 0000000..6aadb94
--- /dev/null
+++ b/pkg/env.yaml
@@ -0,0 +1,11 @@
+name: plumbdb
+channels:
+  - conda-forge
+  - openeye
+
+dependencies:
+  - python=3.11
+  - openeye-toolkits
+  - asapdiscovery
+  - openfe
+  - click
diff --git a/pkg/pyproject.toml b/pkg/pyproject.toml
new file mode 100644
index 0000000..1c00003
--- /dev/null
+++ b/pkg/pyproject.toml
@@ -0,0 +1,31 @@
+[build-system]
+requires = [
+    "setuptools>=77.0.3",
+    "setuptools-scm>=8",
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "plumbdb"
+description = ""
+readme = "README.md"
+authors = [
+    {name = "Ariana Brenner Clerkin", email = "ariana.clerkin@choderlab.org"}
+]
+license = "MIT"
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Operating System :: POSIX",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+    "Topic :: Scientific/Engineering :: Chemistry",
+]
+requires-python = ">= 3.11"
+dynamic = ["version"]
+
+[project.scripts]
+plumbline = "plumbdb.cli:cli"
+
+[project.optional-dependencies]
+test = ["pytest"]
+dev = ["pytest", "ruff"]
\ No newline at end of file
diff --git a/pkg/src/plumbdb/__init__.py b/pkg/src/plumbdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pkg/src/plumbdb/cli.py b/pkg/src/plumbdb/cli.py
new file mode 100644
index 0000000..09bbad3
--- /dev/null
+++ b/pkg/src/plumbdb/cli.py
@@ -0,0 +1,687 @@
+import json
+import pathlib
+
+import click
+
+
+@click.group()
+def cli():
+    pass
+
+
+# TODO: input file type can be better defined
+@cli.command(name="download-pdb", help="Download PDB files from the PDB database.")
+@click.option("-i", "--input-file", type=str, required=True)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+)
+def download_pdb_structure(input_file, output_directory):
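+    # The input JSON is expected to carry at least a "pdb_id" key, e.g.
+    # {"pdb_id": "1ABC"} (illustrative); the path of the downloaded CIF is
+    # written back to the record under the "cif" key.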
+    from asapdiscovery.data.services.rcsb.rcsb_download import download_pdb_structure
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+    with open(input_file, "r") as f:
+        record_dict = json.load(f)
+    downloaded = download_pdb_structure(
+        record_dict["pdb_id"], output_directory, file_format="cif1"
+    )
+    record_dict["cif"] = downloaded
+    with open(output_directory / "record.json", "w") as f:
+        json.dump(record_dict, f)
+
+
+@cli.command(
+    "assess-prepped-protein",
+    help="Assess the quality of the prepped protein structure.",
+)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+)
+@click.option("input_openeye_du", "-i", "--input-file", type=str, required=True)
+def assess_prepped_protein(output_directory, input_openeye_du):
+    try:
+        from openeye import oespruce
+    except ModuleNotFoundError:
+        raise click.UsageError("Could not import openeye")
+
+    from asapdiscovery.data.backend.openeye import load_openeye_design_unit
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+
+    du = load_openeye_design_unit(input_openeye_du)
+
+    # should use a different id method
+    stem = pathlib.Path(input_openeye_du).stem
+    validator = oespruce.OEValidateDesignUnit()
+    err_msgs = validator.GetMessages(validator.Validate(du))
+    sq = du.GetStructureQuality()
+
+    report = {
+        "errors": err_msgs,
+        "warnings": [],
+        "has_iridium_data": sq.HasIridiumData(),
+    }
+
+    with open(output_directory / f"{stem}_quality_report.json", "w") as f:
+        json.dump(report, f, indent=4)
+
+
+# Prep for docking
+from typing import TYPE_CHECKING, Optional
+
+# copying everything
+# TODO: delete everything from here that isn't needed
+
+
+def postera(func):
+    return click.option(
+        "--postera",
+        is_flag=True,
+        default=False,
+        help="Whether to download complexes from Postera.",
+    )(func)
+
+
+def postera_molset_name(func):
+    return click.option(
+        "--postera-molset-name",
+        type=str,
+        default=None,
+        help="The name of the Postera molecule set to use.",
+    )(func)
+
+
+def postera_upload(func):
+    return click.option(
+        "--postera-upload",
+        is_flag=True,
+        default=False,
+        help="Whether to upload results to Postera.",
+    )(func)
+
+
+def postera_args(func):
+    return postera(postera_molset_name(postera_upload(func)))
+
+
+def use_dask(func):
+    return click.option(
+        "--use-dask",
+        is_flag=True,
+        default=False,
+        help="Whether to use dask for parallelism.",
+    )(func)
+
+
+def dask_type(func):
+    return click.option(
+        "--dask-type",
+        type=click.Choice(DaskType.get_values(), case_sensitive=False),
+        default=DaskType.LOCAL,
+        help="The type of dask cluster to use. Local mode is recommended for most use cases.",
+    )(func)
+
+
+def failure_mode(func):
+    return click.option(
+        "--failure-mode",
+        type=click.Choice(FailureMode.get_values(), case_sensitive=False),
+        default=FailureMode.SKIP,
+        help="The failure mode for dask. Can be 'raise' or 'skip'.",
+        show_default=True,
+    )(func)
+
+
+def dask_n_workers(func):
+    return click.option(
+        "--dask-n-workers",
+        type=int,
+        default=None,
+        help="The number of workers to use with dask.",
+    )(func)
+
+
+def dask_args(func):
+    return use_dask(dask_type(dask_n_workers(failure_mode(func))))
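+
+
+# The option decorators above compose; a hypothetical command using the dask
+# options would look like:
+#
+#     @cli.command("my-command")
+#     @dask_args
+#     def my_command(use_dask, dask_type, dask_n_workers, failure_mode):
+#         ...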
+
+
+def target(func):
+    from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+
+    return click.option(
+        "--target",
+        type=click.Choice(TargetTags.get_values(), case_sensitive=True),
+        help="The target for the workflow",
+        required=True,
+    )(func)
+
+
+def ligands(func):
+    return click.option(
+        "-l",
+        "--ligands",
+        type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+        help="File containing ligands",
+    )(func)
+
+
+def output_dir(func):
+    return click.option(
+        "--output-dir",
+        type=click.Path(
+            resolve_path=True, exists=False, file_okay=False, dir_okay=True
+        ),
+        help="The directory to output results to.",
+        default="output",
+    )(func)
+
+
+def overwrite(func):
+    return click.option(
+        "--overwrite/--no-overwrite",
+        default=True,
+        help="Whether to overwrite the output directory if it exists.",
+    )(func)
+
+
+def input_json(func):
+    return click.option(
+        "--input-json",
+        type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+        help="Path to a json file containing the inputs to the workflow, WARNING: overrides all other inputs.",
+    )(func)
+
+
+# flag to run all ml scorers
+def ml_score(func):
+    return click.option(
+        "--ml-score",
+        is_flag=True,
+        default=True,
+        help="Whether to run all ml scorers",
+    )(func)
+
+
+def fragalysis_dir(func):
+    return click.option(
+        "--fragalysis-dir",
+        type=click.Path(resolve_path=True, exists=True, file_okay=False, dir_okay=True),
+        help="Path to a directory containing fragments to dock.",
+    )(func)
+
+
+def structure_dir(func):
+    return click.option(
+        "--structure-dir",
+        type=click.Path(resolve_path=True, exists=True, file_okay=False, dir_okay=True),
+        help="Path to a directory containing structures.",
+    )(func)
+
+
+def pdb_file(func):
+    return click.option(
+        "--pdb-file",
+        type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+        help="Path to a pdb file containing a structure",
+    )(func)
+
+
+def cache_dir(func):
+    return click.option(
+        "--cache-dir",
+        type=click.Path(
+            resolve_path=True, exists=False, file_okay=False, dir_okay=True
+        ),
+        help="Path to a directory where design units are cached.",
+    )(func)
+
+
+def use_only_cache(func):
+    return click.option(
+        "--use-only-cache",
+        is_flag=True,
+        default=False,
+        help="Whether to only use the cache.",
+    )(func)
+
+
+def gen_cache_w_default(func):
+    return click.option(
+        "--gen-cache",
+        type=click.Path(
+            resolve_path=False, exists=False, file_okay=False, dir_okay=True
+        ),
+        help="Path to a directory where a design unit cache should be generated.",
+        default="prepped_structure_cache",
+    )(func)
+
+
+def md(func):
+    return click.option(
+        "--md",
+        is_flag=True,
+        default=False,
+        help="Whether to run MD",
+    )(func)
+
+
+def md_steps(func):
+    return click.option(
+        "--md-steps",
+        type=int,
+        default=2500000,
+        help="Number of MD steps",
+    )(func)
+
+
+def core_smarts(func):
+    return click.option(
+        "-cs",
+        "--core-smarts",
+        type=click.STRING,
+        help="The SMARTS which should be used to select which atoms to constrain to the reference structure.",
+    )(func)
+
+
+def save_to_cache(func):
+    return click.option(
+        "--save-to-cache/--no-save-to-cache",
+        help="If the newly generated structures should be saved to the cache folder.",
+        default=True,
+    )(func)
+
+
+def loglevel(func):
+    return click.option(
+        "--loglevel",
+        type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+        help="The log level to use.",
+        default="INFO",
+        show_default=True,
+    )(func)
+
+
+def ref_chain(func):
+    return click.option(
+        "--ref-chain",
+        type=str,
+        default=None,
+        help="Chain ID to align to in reference structure containing the active site.",
+    )(func)
+
+
+def active_site_chain(func):
+    return click.option(
+        "--active-site-chain",
+        type=str,
+        default=None,
+        help="Active site chain ID to align to ref_chain in reference structure",
+    )(func)
+
+
+from asapdiscovery.data.util.dask_utils import DaskType, FailureMode
+
+if TYPE_CHECKING:
+    from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+
+
+@cli.command(
+    "prep-protein-for-docking",
+    help="Prep protein to make OE Design Units and corresponding schema.",
+)
+@target
+@click.option(
+    "--align",
+    type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+    help="Path to a reference structure to align to",
+)
+@ref_chain
+@active_site_chain
+@click.option(
+    "--seqres-yaml",
+    type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+    help="Path to a seqres yaml file to mutate to; if not specified, will use the default for the target",
+)
+@click.option(
+    "--loop-db",
+    type=click.Path(resolve_path=True, exists=True, file_okay=True, dir_okay=False),
+    help="Path to a loop database to use for prepping",
+)
+@click.option(
+    "--oe-active-site-residue",
+    type=str,
+    help="OE-formatted string of the active site residue to use if not ligand bound",
+)
+@pdb_file
+@fragalysis_dir
+@structure_dir
+@click.option(
+    "--cache-dir",
+    help="The path to cached prepared complexes which can be used again.",
+    type=click.Path(resolve_path=True, exists=True, file_okay=False, dir_okay=True),
+)
+@save_to_cache
+@dask_args
+@output_dir
+@input_json
+def protein_prep(
+    target: "TargetTags",
+    align: Optional[str] = None,
+    ref_chain: Optional[str] = None,
+    active_site_chain: Optional[str] = None,
+    seqres_yaml: Optional[str] = None,
+    loop_db: Optional[str] = None,
+    oe_active_site_residue: Optional[str] = None,
+    pdb_file: Optional[str] = None,
+    fragalysis_dir: Optional[str] = None,
+    structure_dir: Optional[str] = None,
+    cache_dir: Optional[str] = None,
+    save_to_cache: bool = True,
+    use_dask: bool = False,
+    dask_type: DaskType = DaskType.LOCAL,
+    dask_n_workers: Optional[int] = None,
+    failure_mode: FailureMode = FailureMode.SKIP,
+    output_dir: str = "output",
+    input_json: Optional[str] = None,
+):
+    """
+    Run protein prep on a set of structures.
+    """
+    from asapdiscovery.workflows.prep_workflows.protein_prep import (
+        ProteinPrepInputs,
+        protein_prep_workflow,
+    )
+
+    if input_json is not None:
+        print("Loading inputs from json file... Will override all other inputs.")
+        inputs = ProteinPrepInputs.from_json_file(input_json)
+
+    else:
+        inputs = ProteinPrepInputs(
+            target=target,
+            align=align,
+            ref_chain=ref_chain,
+            active_site_chain=active_site_chain,
+            seqres_yaml=seqres_yaml,
+            loop_db=loop_db,
+            oe_active_site_residue=oe_active_site_residue,
+            pdb_file=pdb_file,
+            fragalysis_dir=fragalysis_dir,
+            structure_dir=structure_dir,
+            cache_dir=cache_dir,
+            save_to_cache=save_to_cache,
+            use_dask=use_dask,
+            dask_type=dask_type,
+            dask_n_workers=dask_n_workers,
+            failure_mode=failure_mode,
+            output_dir=output_dir,
+        )
+
+    protein_prep_workflow(inputs)
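+
+
+# Example invocation (mirrors the PREP_FOR_DOCKING process in modules.nf;
+# "prepped.pdb" is a placeholder):
+#
+#     plumbline prep-protein-for-docking \
+#         --target SARS-CoV-2-Mpro \
+#         --pdb-file prepped.pdb \
+#         --output-dir ./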
+
+
+# TODO: check for openeye installation, maybe make it a decorator
+@cli.command(
+    "generate-constrained-ligand-poses",
+    help="Generate constrained ligand poses using OpenEye tooling.",
+)
+@click.option("input_sdf", "-i", "--input-sdf", required=True, type=str)
+@click.option("prepped_schema", "-s", "--prepped-schema", type=str)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+)
+def generate_constrained_ligand_poses(input_sdf, prepped_schema, output_directory):
+    try:
+        from asapdiscovery.data.schema.complex import PreppedComplex
+        from asapdiscovery.data.readers.molfile import MolFileFactory
+        from asapdiscovery.data.schema.ligand import Ligand
+        from asapdiscovery.docking.schema.pose_generation import (
+            OpenEyeConstrainedPoseGenerator,
+        )
+        from asapdiscovery.data.backend.openeye import save_openeye_sdfs
+    except ModuleNotFoundError:
+        raise click.UsageError("Could not import openeye")
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+
+    raw_ligands = MolFileFactory(filename=input_sdf).load()
+
+    # reconstruct ligands from smiles because of some weirdness with the sdf files
+    ligs = [
+        Ligand.from_smiles(
+            compound_name=lig.tags["BindingDB MonomerID"],  # Changed capitalization
+            smiles=lig.smiles,
+            tags=lig.dict(),
+        )
+        for lig in raw_ligands
+    ]
+
+    prepped_complex = PreppedComplex.parse_file(prepped_schema)
+
+    poser = OpenEyeConstrainedPoseGenerator()
+    poses = poser.generate_poses(prepped_complex, ligands=ligs)
+    oemols = [ligand.to_oemol() for ligand in poses.posed_ligands]
+    # save to sdf file
+    save_openeye_sdfs(oemols, output_directory / "poses.sdf")
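+
+
+# Example invocation (mirrors GENERATE_CONSTRAINED_LIGAND_POSES in modules.nf;
+# the file names are placeholders):
+#
+#     plumbline generate-constrained-ligand-poses \
+#         --input-sdf congeneric_series.sdf \
+#         --prepped-schema prepped_complex.json \
+#         --output-directory ./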
+
+
+@cli.command("prep-cif")
+@click.option("input_json", "-j", "--input-json", type=str, required=True)
+@click.option("input_cif", "-c", "--input-cif", type=str, required=True)
+@click.option(
+    "fasta_sequence", "-f", "--fasta-sequence", type=str, default=None, required=True
+)
+@click.option("loop_db", "--loopdb", type=str, default=None)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    default="./",
+    required=True,
+)
+def prep_cif(input_json, input_cif, fasta_sequence, loop_db, output_directory):
+    from asapdiscovery.data.backend.openeye import load_openeye_cif1
+    from asapdiscovery.modeling.modeling import split_openeye_mol
+    from asapdiscovery.data.schema.ligand import Ligand
+    from asapdiscovery.data.schema.complex import Complex
+    from plumbdb.oespruce import spruce_protein
+    from asapdiscovery.data.schema.target import Target
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+
+    with open(input_json, "r") as f:
+        record_dict = json.load(f)
+
+    graphmol = load_openeye_cif1(input_cif)
+
+    # this is what you would do if you didn't want to use whatever ligand is
+    # in the protein:
+    # split_dict = split_openeye_mol(graphmol, keep_one_lig=False)
+    split_dict = split_openeye_mol(graphmol, keep_one_lig=True)
+
+    target = Target.from_oemol(
+        split_dict["prot"],
+        target_name=record_dict["pdb_id"],
+    )
+    # Save the initial protein as a pdb file, if needed:
+    # target.to_pdb("protein.pdb")
+
+    # likewise, to use a ligand from an external SDF instead of the one found
+    # in the structure:
+    # ligand = Ligand.from_sdf(f'{record_dict["compound_name"]}.sdf')
+    ligand = Ligand.from_oemol(split_dict["lig"])
+
+    combined_complex = Complex(target=target, ligand=ligand, ligand_chain="L")
+
+    oemol = combined_complex.to_combined_oemol()
+
+    results, spruced = spruce_protein(
+        initial_prot=oemol,
+        protein_sequence=fasta_sequence,
+        loop_db=loop_db,
+    )
+
+    split_dict = split_openeye_mol(spruced)
+    prepped_target = Target.from_oemol(split_dict["prot"], **target.dict())
+    prepped_ligand = Ligand.from_oemol(split_dict["lig"], **ligand.dict())
+    prepped_ligand.to_sdf(output_directory / f"{ligand.compound_name}_ligand.sdf")
+    prepped_target.to_pdb(output_directory / f"{target.target_name}_spruced.pdb")
+
+    prepped_complex = Complex(
+        target=prepped_target, ligand=prepped_ligand, ligand_chain="L"
+    )
+
+    filename = f"{target.target_name}_{ligand.compound_name}_spruced_complex.pdb"
+    prepped_complex.to_pdb(output_directory / filename)
+    results.to_json_file(output_directory / f"{target.target_name}_spruce_results.json")
+
+
+# TODO: help string
+@cli.command(
+    "process-bindingdb", help="Parse and verify SDF files downloaded from bindingdb."
+)
+@click.option(
+    "input_directory",
+    "-i",
+    "--input-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+    help="Directory containing the SDF files to process",
+)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+    help="Directory to write output files",
+)
+def process_bindingdb(input_directory, output_directory):
+    from asapdiscovery.data.schema.ligand import Ligand
+    from asapdiscovery.data.readers.molfile import MolFileFactory
+    import pandas as pd
+    import math
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+
+    # get all sdf files
+    sdfs = list(input_directory.glob("*3D.sdf"))
+
+    output = []
+    for sdf in sdfs:
+        # asap function to read separate ligands from a multi-ligand sdf file
+        mols: list[Ligand] = MolFileFactory(filename=sdf).load()
+
+        # create a dictionary for each ligand containing various relevant
+        # information; there are some hidden choices here, for instance
+        # OpenEye is adding hydrogens, which you might not want
+        for mol in mols:
+            mol_dict = {
+                "compound_name": mol.compound_name,
+                "filename": sdf.name,
+                "has_3d": mol.to_oemol().GetDimension() == 3,
+                "num_atoms": mol.to_oemol().NumAtoms(),
+                "smiles": mol.smiles,
+                # "pdb_id": mol.tags.get("PDB ID")[:4]  # removed trailing space
+                # if mol.tags.get("PDB ID")  # removed trailing space
+                "pdb_id": mol.tags.get("PDB ID(s) for Ligand-Target Complex")[:4]  # removed trailing space
+                if mol.tags.get("PDB ID(s) for Ligand-Target Complex")
+                else "",
+            }
+
+            # any data in the SDF file is saved to the 'tags' attribute of an
+            # asapdiscovery Ligand object
+            mol_dict.update(mol.tags)
+
+            # write out sdf file
+            if mol_dict["has_3d"]:
+                mol.to_sdf(output_directory / f"{mol.compound_name}.sdf")
+
+            output.append(mol_dict)
+
+    df = pd.DataFrame.from_records(output)
+    df.to_csv(output_directory / "processed_bindingdb.csv", index=False)
+
+    # write separate csvs for 2D and 3D
+    df_2d = df[~df["has_3d"]]
+    df_3d = df[df["has_3d"]]
+    df_2d.to_csv(output_directory / "2d_bindingdb.csv", index=False)
+    df_3d.to_csv(output_directory / "3d_bindingdb.csv", index=False)
+
+    unique_sdf_filenames = df_3d["filename"].unique()
+    with open(output_directory / "unique_3D_sdf_filenames.txt", "w") as f:
+        for filename in unique_sdf_filenames:
+            f.write(f"{filename}\n")
+
+    # write out separate json records; pandas leaves NaN in empty cells, which
+    # is not valid JSON, so replace NaN values with None before serializing
+    for record in df_3d.to_dict(orient="records"):
+        record = {
+            k: None if isinstance(v, float) and math.isnan(v) else v
+            for k, v in record.items()
+        }
+        with open(output_directory / f"{record['compound_name']}.json", "w") as f:
+            f.write(json.dumps(record, indent=4))
+
+
+@cli.command("rename-ligands-for-openfe")
+@click.option(
+    "-i",
+    "--input-sdf",
+    type=click.Path(file_okay=True, dir_okay=False, path_type=pathlib.Path),
+    required=True,
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+    default=pathlib.Path("./"),
+    help="Path to the output directory where the results will be stored",
+)
+def rename_ligands_for_sdf(input_sdf, output_dir):
+    from asapdiscovery.data.readers.molfile import MolFileFactory
+    from asapdiscovery.data.schema.ligand import Ligand, write_ligands_to_multi_sdf
+
+    # copy each ligand's compound name onto its OEMol title before writing
+    # the combined sdf, so downstream openfe tooling can identify ligands
+    mols: list[Ligand] = MolFileFactory(filename=input_sdf).load()
+    new_ligands = []
+    for mol in mols:
+        oemol = mol.to_oemol()
+        oemol.SetTitle(mol.compound_name)
+        new_ligands.append(Ligand.from_oemol(oemol))
+    output_sdf = output_dir / "renamed.sdf"
+    write_ligands_to_multi_sdf(output_sdf, new_ligands, overwrite=True)
+
+
+@cli.command(
+    "visualize-network",
+    help="Generate a network plot of the proposed alchemical network",
+)
+@click.option(
+    "network_graphml",
+    "-n",
+    "--network-graphml",
+    type=str,
+    required=True,
+    help="Path to the input GraphML file containing the atom mapping network.",
+)
+@click.option(
+    "output_directory",
+    "-o",
+    "--output-directory",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+    required=True,
+    default=pathlib.Path("./"),
+    help="Path to the output directory where the results will be stored",
+)
+def visualize_network(network_graphml, output_directory):
+    from openfe.utils.atommapping_network_plotting import plot_atommapping_network
+    from openfe.setup import LigandNetwork
+
+    output_directory.mkdir(exist_ok=True, parents=True)
+    # --network-graphml is received as a plain string, so wrap it in a Path
+    ligand_network = pathlib.Path(network_graphml)
+
+    if not ligand_network.exists():
+        raise FileNotFoundError(
+            f"Could not find the ligand network file at {ligand_network}"
+        )
+
+    with open(ligand_network) as f:
+        graphml = f.read()
+
+    network = LigandNetwork.from_graphml(graphml)
+    fig = plot_atommapping_network(network)
+    fig.savefig(output_directory / "network_plot.png")
diff --git a/pkg/src/plumbdb/oespruce.py b/pkg/src/plumbdb/oespruce.py
new file mode 100644
index 0000000..7f986fe
--- /dev/null
+++ b/pkg/src/plumbdb/oespruce.py
@@ -0,0 +1,173 @@
+import pathlib
+from typing import Optional
+
+from asapdiscovery.data.schema.schema_base import DataModelAbstractBase
+from asapdiscovery.data.backend.openeye import (
+    oespruce,
+    oechem,
+    oegrid,
+)
+from asapdiscovery.modeling.modeling import (
+    get_oe_structure_metadata_from_sequence,
+    openeye_perceive_residues,
+)
+
+
+class SpruceResults(DataModelAbstractBase):
+    build_loops_success: bool
+    build_sidechains_success: bool
+    add_caps_success: Optional[bool] = None
+    place_hydrogens_success: bool
+    error_message: str
+
+
+# TODO: this was originally defined in asapdiscovery.modeling.modeling?
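+# Illustrative usage (see prep_cif in cli.py for the real call site; the
+# variable names are placeholders):
+#
+#     results, spruced = spruce_protein(
+#         initial_prot=oemol,
+#         protein_sequence=fasta_sequence,
+#         loop_db=loop_db,
+#     )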
+def spruce_protein(
+    initial_prot: oechem.OEGraphMol,
+    protein_sequence: str = None,
+    loop_db: pathlib.Path = None,
+) -> tuple[SpruceResults, oechem.OEGraphMol]:
+    """
+    Applies the OESpruce protein preparation pipeline to the given protein structure.
+
+    Parameters
+    ----------
+    initial_prot : oechem.OEGraphMol
+        The input protein structure to be prepared.
+
+    protein_sequence : str, optional
+        The sequence of the protein for a single chain. If provided, this will be
+        added to the Structure Metadata before applying the OESpruce pipeline.
+        Default is None.
+
+    loop_db : pathlib.Path, optional
+        The filename of the loop database to be used by the OESpruce pipeline. If
+        provided, the pipeline will include the loop building step.
+        Default is None.
+
+    Returns
+    -------
+    (results: SpruceResults, prepared_prot: oechem.OEGraphMol)
+        A SpruceResults record of which pipeline stages succeeded (including any
+        Spruce error message), and the prepared protein structure.
+    """
+
+    # Add Hs to prep protein and ligand, if desired:
+    # oechem.OEAddExplicitHydrogens(initial_prot)
+
+    # Even though we aren't making DUs, we still need to set up the options
+    opts = oespruce.OEMakeDesignUnitOptions()
+    opts.SetSuperpose(False)
+    opts.GetPrepOptions().SetStrictProtonationMode(True)
+
+    # Add caps when needed
+    # Allow truncation in case adding a cap causes a clash
+    cap_opts = oespruce.OECapBuilderOptions()
+    cap_opts.SetAllowTruncate(True)
+    is_terminal_predicate = oechem.OEOrAtom(
+        oechem.OEIsNTerminalAtom(), oechem.OEIsCTerminalAtom()
+    )
+
+    # Set Build Loop and Sidechain Opts
+    sc_opts = oespruce.OESidechainBuilderOptions()
+
+    loop_opts = oespruce.OELoopBuilderOptions()
+    loop_opts.SetSeqAlignMethod(oechem.OESeqAlignmentMethod_Identity)
+    loop_opts.SetSeqAlignGapPenalty(-1)
+    loop_opts.SetSeqAlignExtendPenalty(0)
+
+    # Don't build tails, too much work for little gain
+    loop_opts.SetBuildTails(False)
+
+    if loop_db is not None:
+        print(f"Adding loop db {loop_db}")
+        loop_opts.SetLoopDBFilename(str(loop_db))
+
+    # Construct spruce filter
+    spruce_opts = oespruce.OESpruceFilterOptions()
+    spruce = oespruce.OESpruceFilter(spruce_opts, opts)
+
+    # Spruce!
+
+    # These objects are for some reason needed in order to run spruce
+    grid = oegrid.OESkewGrid()
+
+    if protein_sequence:
+        # convert fasta sequence to 3-letter codes
+        try:
+            protein_sequence = " ".join(convert_to_three_letter_codes(protein_sequence))
+            print("Adding sequence metadata from sequence: ", protein_sequence)
+            metadata = get_oe_structure_metadata_from_sequence(
+                initial_prot, protein_sequence
+            )
+        except KeyError as e:
+            print(
+                f"Error converting protein sequence to 3-letter codes: {e}. "
+                "Skipping sequence metadata."
+            )
+            protein_sequence = None
+
+    if not protein_sequence:
+        metadata = oespruce.OEStructureMetadata()
+
+    # Building the loops actually does use the sequence metadata
+    build_loops_success = oespruce.OEBuildLoops(
+        initial_prot, metadata, sc_opts, loop_opts
+    )
+
+    build_sidechains_success = oespruce.OEBuildSidechains(initial_prot, sc_opts)
+    add_caps_success = oespruce.OECapTermini(
+        initial_prot, is_terminal_predicate, cap_opts
+    )
+    place_hydrogens_success = oechem.OEPlaceHydrogens(initial_prot)
+    spruce_error_code = spruce.StandardizeAndFilter(initial_prot, grid, metadata)
+    spruce_error_msg = spruce.GetMessages(spruce_error_code)
+
+    # Re-perceive residues so that atom numbering and CONECT records don't get
+    # scrambled
+    initial_prot = openeye_perceive_residues(initial_prot, preserve_all=False)
+    return (
+        SpruceResults(
+            build_loops_success=build_loops_success,
+            build_sidechains_success=build_sidechains_success,
+            add_caps_success=add_caps_success,
+            place_hydrogens_success=place_hydrogens_success,
+            error_message=spruce_error_msg,
+        ),
+        initial_prot,
+    )
+
+
+def convert_to_three_letter_codes(sequence) -> list[str]:
+    """
+    Convert a protein sequence from 1-letter codes to 3-letter codes.
+    """
+    # Dictionary to map 1-letter codes to 3-letter codes
+    amino_acid_dict = {
+        "A": "ALA",
+        "R": "ARG",
+        "N": "ASN",
+        "D": "ASP",
+        "C": "CYS",
+        "Q": "GLN",
+        "E": "GLU",
+        "G": "GLY",
+        "H": "HIS",
+        "I": "ILE",
+        "L": "LEU",
+        "K": "LYS",
+        "M": "MET",
+        "F": "PHE",
+        "P": "PRO",
+        "S": "SER",
+        "T": "THR",
+        "W": "TRP",
+        "Y": "TYR",
+        "V": "VAL",
+    }
+
+    # Convert the sequence to 3-letter codes; unknown residues raise KeyError,
+    # which spruce_protein catches above
+    three_letter_sequence = [amino_acid_dict[aa] for aa in sequence]
+
+    return three_letter_sequence
diff --git a/pkg/src/plumbdb/tests/test_plumb.py b/pkg/src/plumbdb/tests/test_plumb.py
new file mode 100644
index 0000000..e59fd58
--- /dev/null
+++ b/pkg/src/plumbdb/tests/test_plumb.py
@@ -0,0 +1,15 @@
+def test_import():
+    import plumbdb
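+
+
+def test_cli_help():
+    # Minimal smoke test (a sketch): invoking --help should succeed and list
+    # the registered subcommands without touching any OpenEye-backed logic.
+    from click.testing import CliRunner
+
+    from plumbdb.cli import cli
+
+    runner = CliRunner()
+    result = runner.invoke(cli, ["--help"])
+    assert result.exit_code == 0
+    assert "download-pdb" in result.output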