From 7fcac8288791084b196d8b972658f18022799f6d Mon Sep 17 00:00:00 2001 From: delalamo Date: Mon, 12 Jan 2026 09:08:33 -0500 Subject: [PATCH 01/22] Add Docker support with GitHub Container Registry publishing - Add Dockerfile using condaforge/miniforge3 for conda dependencies - Add docker-build.yml workflow triggered by releases, tags, or manual dispatch - Add .dockerignore to exclude build artifacts - Update README with Docker usage instructions Image published to ghcr.io/delalamo/graphrelax Co-Authored-By: Claude Opus 4.5 --- .dockerignore | 14 ++++++ .github/workflows/docker-build.yml | 70 ++++++++++++++++++++++++++++++ Dockerfile | 38 ++++++++++++++++ README.md | 24 ++++++++++ 4 files changed, 146 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/docker-build.yml create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d308aa8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +.git +__pycache__ +*.py[cod] +*.egg-info/ +dist/ +build/ +.pytest_cache/ +.coverage +.venv/ +venv/ +.idea/ +.vscode/ +src/graphrelax/LigandMPNN/model_params/*.pt +.DS_Store diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 0000000..f4006cb --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,70 @@ +name: Docker Build + +on: + release: + types: [published] + push: + tags: + - "v*" + workflow_dispatch: + inputs: + tag: + description: "Image tag (optional, defaults to 'dev')" + required: false + default: "dev" + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Compute image tags + id: tags + run: | + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' 
'[:lower:]') + if [[ "${{ github.event_name }}" == "release" ]]; then + VERSION="${{ github.event.release.tag_name }}" + VERSION="${VERSION#v}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${VERSION},${{ env.REGISTRY }}/${REPO_LOWER}:latest" >> $GITHUB_OUTPUT + elif [[ "${{ github.ref }}" == refs/tags/v* ]]; then + VERSION="${GITHUB_REF#refs/tags/v}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${VERSION}" >> $GITHUB_OUTPUT + else + TAG="${{ github.event.inputs.tag || 'dev' }}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${TAG}" >> $GITHUB_OUTPUT + fi + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: ${{ steps.tags.outputs.tags }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Verify image + run: | + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + TAG="${{ github.event.inputs.tag || 'dev' }}" + docker run --rm ${{ env.REGISTRY }}/${REPO_LOWER}:${TAG} --help diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5d60f2d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +# GraphRelax Docker Image +# Combines LigandMPNN sequence design with OpenMM AMBER relaxation +# +# Build: docker build -t graphrelax . 
+# Run: docker run --rm -v $(pwd):/data graphrelax -i /data/input.pdb -o /data/output.pdb + +FROM condaforge/miniforge3:latest + +LABEL org.opencontainers.image.source="https://github.com/delalamo/GraphRelax" +LABEL org.opencontainers.image.description="GraphRelax: LigandMPNN + OpenMM AMBER relaxation" +LABEL org.opencontainers.image.licenses="MIT" + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy package files +COPY pyproject.toml README.md LICENSE ./ +COPY src/ ./src/ +COPY scripts/ ./scripts/ + +# Install conda dependencies and package +# Note: openmm and pdbfixer are only available on conda-forge, not PyPI +RUN mamba install -y -c conda-forge python=3.11 openmm pdbfixer pytorch-cpu \ + && mamba clean -afy \ + && pip install --no-cache-dir -e . + +# Download model weights (~40MB total) +RUN ./scripts/download_weights.sh + +# Create non-root user for security +RUN useradd -m graphrelax && chown -R graphrelax:graphrelax /app +USER graphrelax + +ENTRYPOINT ["graphrelax"] +CMD ["--help"] diff --git a/README.md b/README.md index 455bc8a..841d777 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,30 @@ pip install -e ".[cuda11]" pip install -e ".[cuda12]" ``` +### Docker + +Run GraphRelax using Docker without installing dependencies: + +```bash +# Pull the image +docker pull ghcr.io/delalamo/graphrelax:latest + +# Run with input/output files +docker run --rm -v $(pwd):/data ghcr.io/delalamo/graphrelax:latest \ + -i /data/input.pdb -o /data/output.pdb + +# Design mode with 5 outputs +docker run --rm -v $(pwd):/data ghcr.io/delalamo/graphrelax:latest \ + -i /data/input.pdb -o /data/designed.pdb --design -n 5 +``` + +Build locally: + +```bash +docker build -t graphrelax . 
+docker run --rm graphrelax --help +``` + ### Dependencies Core dependencies (installed automatically via pip): From 73616967cfbff0c1604e16e1c4f3e482b68a9f56 Mon Sep 17 00:00:00 2001 From: delalamo Date: Mon, 12 Jan 2026 09:10:40 -0500 Subject: [PATCH 02/22] Adding .DS_Store to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d93d2a4..cc697a2 100644 --- a/.gitignore +++ b/.gitignore @@ -212,3 +212,6 @@ src/graphrelax/LigandMPNN/model_params/*.pt # Auto-generated version file (setuptools-scm) src/graphrelax/_version.py + +# Misc +.DS_Store From 455402d282e4a57e3ae70da4232951d2542da870 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 14:54:53 +0000 Subject: [PATCH 03/22] Add chain gap detection to prevent artificial gap closure during minimization Detects missing residues in protein chains by checking residue numbering discontinuities and C-N bond distances. Chains are split at gaps before OpenMM minimization to prevent the creation of unrealistic peptide bonds across gaps. Original chain IDs are restored after minimization. 
- Add chain_gaps.py module with detect_chain_gaps, split_chains_at_gaps, and restore_chain_ids functions - Integrate gap detection into relaxer.py relax() method - Add split_chains_at_gaps config option (enabled by default) - Add --no-split-gaps CLI flag to disable the feature - Add comprehensive tests for chain gap detection --- src/graphrelax/chain_gaps.py | 351 +++++++++++++++++++++++++++++++++++ src/graphrelax/cli.py | 10 + src/graphrelax/config.py | 1 + src/graphrelax/relaxer.py | 35 +++- tests/test_chain_gaps.py | 344 ++++++++++++++++++++++++++++++++++ 5 files changed, 739 insertions(+), 2 deletions(-) create mode 100644 src/graphrelax/chain_gaps.py create mode 100644 tests/test_chain_gaps.py diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py new file mode 100644 index 0000000..49b15ae --- /dev/null +++ b/src/graphrelax/chain_gaps.py @@ -0,0 +1,351 @@ +"""Detection and handling of chain gaps in protein structures. + +This module provides functions to detect missing residues (gaps) in protein +chains and to split chains at those gaps prior to minimization. This prevents +OpenMM minimization from artificially closing gaps by creating unrealistic +peptide bonds. 
+""" + +import io +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +from Bio.PDB import PDBParser + +logger = logging.getLogger(__name__) + +# Maximum C-N peptide bond distance in Angstroms +# Typical C-N bond is ~1.33 A, anything > 2.0 A indicates a break +MAX_PEPTIDE_BOND_DISTANCE = 2.0 + +# Maximum residue number gap that's considered sequential +# Gap > 1 indicates missing residues +MAX_SEQUENTIAL_GAP = 1 + + +@dataclass +class ChainGap: + """Represents a gap in a protein chain.""" + + chain_id: str + residue_before: int # Residue number before the gap + residue_after: int # Residue number after the gap + distance: Optional[float] = None # C-N distance if available + icode_before: str = "" + icode_after: str = "" + + def __str__(self) -> str: + gap_size = self.residue_after - self.residue_before - 1 + dist_str = f", dist={self.distance:.2f}A" if self.distance else "" + return ( + f"Chain {self.chain_id}: gap of {gap_size} residues " + f"between {self.residue_before}{self.icode_before} and " + f"{self.residue_after}{self.icode_after}{dist_str}" + ) + + +def detect_chain_gaps( + pdb_string: str, + check_distance: bool = True, + max_bond_distance: float = MAX_PEPTIDE_BOND_DISTANCE, +) -> List[ChainGap]: + """ + Detect gaps in protein chains. + + Gaps are detected by two methods: + 1. Residue numbering discontinuities (missing residue numbers) + 2. 
Large C-N distances between consecutive residues (if check_distance=True) + + Args: + pdb_string: PDB file contents as string + check_distance: Whether to also check C-N bond distances + max_bond_distance: Maximum allowed C-N distance in Angstroms + + Returns: + List of ChainGap objects describing each detected gap + """ + parser = PDBParser(QUIET=True) + structure = parser.get_structure("protein", io.StringIO(pdb_string)) + + gaps = [] + + for model in structure: + for chain in model: + residues = [r for r in chain.get_residues() if r.id[0] == " "] + if len(residues) < 2: + continue + + for i in range(len(residues) - 1): + res1 = residues[i] + res2 = residues[i + 1] + + resnum1 = res1.id[1] + resnum2 = res2.id[1] + icode1 = res1.id[2].strip() + icode2 = res2.id[2].strip() + + # Check for numbering gap + numbering_gap = resnum2 - resnum1 > MAX_SEQUENTIAL_GAP + + # Check C-N distance if requested + distance = None + distance_gap = False + if check_distance: + try: + c_atom = res1["C"] + n_atom = res2["N"] + distance = c_atom - n_atom + distance_gap = distance > max_bond_distance + except KeyError: + # Missing backbone atoms - treat as potential gap + distance_gap = True + + if numbering_gap or distance_gap: + gap = ChainGap( + chain_id=chain.id, + residue_before=resnum1, + residue_after=resnum2, + distance=distance, + icode_before=icode1, + icode_after=icode2, + ) + gaps.append(gap) + logger.debug(f"Detected gap: {gap}") + + if gaps: + logger.info(f"Detected {len(gaps)} chain gap(s) in structure") + else: + logger.debug("No chain gaps detected") + + return gaps + + +def split_chains_at_gaps( + pdb_string: str, + gaps: Optional[List[ChainGap]] = None, +) -> Tuple[str, Dict[str, str]]: + """ + Split chains at detected gaps by assigning new chain IDs. + + Each continuous segment gets a unique chain ID. This prevents OpenMM + from creating peptide bonds across gaps during minimization. 
+ + Args: + pdb_string: PDB file contents as string + gaps: Pre-detected gaps (if None, will detect automatically) + + Returns: + Tuple of: + - Modified PDB string with split chains + - Mapping from new chain IDs to original chain IDs + """ + if gaps is None: + gaps = detect_chain_gaps(pdb_string) + + if not gaps: + return pdb_string, {} + + # Build a set of (chain_id, residue_after) for gap locations + # We'll start a new chain at each gap + gap_starts = {(g.chain_id, g.residue_after, g.icode_after) for g in gaps} + + # Available chain IDs (A-Z, a-z, 0-9) + all_chain_ids = ( + list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + + list("abcdefghijklmnopqrstuvwxyz") + + list("0123456789") + ) + + # Parse to find existing chain IDs + existing_chains = set() + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 21: + existing_chains.add(line[21]) + + # Create pool of available new chain IDs + available_ids = [c for c in all_chain_ids if c not in existing_chains] + + # Track chain ID assignments + chain_mapping = {} # new_chain_id -> original_chain_id + current_chain_map = {} # original_chain_id -> current_new_chain_id + next_id_idx = 0 + + # Process PDB line by line + output_lines = [] + current_residue = {} # chain_id -> (resnum, icode) + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 26: + chain_id = line[21] + resnum = int(line[22:26].strip()) + icode = line[26].strip() if len(line) > 26 else "" + + # Check if this residue starts a new segment (at a gap) + if (chain_id, resnum, icode) in gap_starts: + # Assign a new chain ID for this segment + if next_id_idx < len(available_ids): + new_chain_id = available_ids[next_id_idx] + next_id_idx += 1 + current_chain_map[chain_id] = new_chain_id + chain_mapping[new_chain_id] = chain_id + logger.debug( + f"Assigned new chain {new_chain_id} for segment " + f"starting at {chain_id}:{resnum}{icode}" + ) + else: + logger.warning( + "Ran out of available 
chain IDs for gap splitting" + ) + + # Initialize chain mapping if not seen before + if chain_id not in current_chain_map: + current_chain_map[chain_id] = chain_id + # Original chains map to themselves + if chain_id not in chain_mapping: + chain_mapping[chain_id] = chain_id + + # Replace chain ID in line + new_chain_id = current_chain_map[chain_id] + line = line[:21] + new_chain_id + line[22:] + + current_residue[chain_id] = (resnum, icode) + + elif line.startswith("TER") and len(line) > 21: + # Update TER record chain ID too + chain_id = line[21] + if chain_id in current_chain_map: + new_chain_id = current_chain_map[chain_id] + line = line[:21] + new_chain_id + line[22:] + + output_lines.append(line) + + result = "\n".join(output_lines) + + if chain_mapping: + # Only log non-trivial mappings + non_trivial = {k: v for k, v in chain_mapping.items() if k != v} + if non_trivial: + logger.info( + f"Split chains at {len(gaps)} gap(s), " + f"created {len(non_trivial)} new chain segment(s)" + ) + + return result, chain_mapping + + +def restore_chain_ids( + pdb_string: str, + chain_mapping: Dict[str, str], +) -> str: + """ + Restore original chain IDs after minimization. 
+ + Args: + pdb_string: PDB string with split chains + chain_mapping: Mapping from current chain IDs to original chain IDs + + Returns: + PDB string with original chain IDs restored + """ + if not chain_mapping: + return pdb_string + + # Build reverse mapping (new -> original) + # Note: chain_mapping is new_id -> original_id + output_lines = [] + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 21: + current_chain = line[21] + if current_chain in chain_mapping: + original_chain = chain_mapping[current_chain] + line = line[:21] + original_chain + line[22:] + + elif line.startswith("TER") and len(line) > 21: + current_chain = line[21] + if current_chain in chain_mapping: + original_chain = chain_mapping[current_chain] + line = line[:21] + original_chain + line[22:] + + output_lines.append(line) + + logger.debug("Restored original chain IDs") + return "\n".join(output_lines) + + +def add_ter_records_at_gaps(pdb_string: str, gaps: List[ChainGap]) -> str: + """ + Add TER records at gap locations to signal chain breaks. + + This is an alternative to chain splitting that works with force fields + that recognize TER as chain termination. 
+ + Args: + pdb_string: PDB file contents as string + gaps: List of detected gaps + + Returns: + PDB string with TER records inserted at gap locations + """ + if not gaps: + return pdb_string + + # Build set of (chain_id, resnum, icode) where TER should be inserted + # TER goes after residue_before + ter_locations = { + (g.chain_id, g.residue_before, g.icode_before) for g in gaps + } + + output_lines = [] + prev_chain = None + prev_resnum = None + prev_icode = None + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 26: + chain_id = line[21] + resnum = int(line[22:26].strip()) + icode = line[26].strip() if len(line) > 26 else "" + + # Check if we need to insert TER before this line + # (i.e., previous residue was at a gap location) + if ( + prev_chain is not None + and (prev_chain, prev_resnum, prev_icode) in ter_locations + and (chain_id != prev_chain or resnum != prev_resnum) + ): + # Insert TER record + ter_line = "TER " + output_lines.append(ter_line) + logger.debug( + f"Inserted TER after {prev_chain}:{prev_resnum}{prev_icode}" + ) + + prev_chain = chain_id + prev_resnum = resnum + prev_icode = icode + + output_lines.append(line) + + return "\n".join(output_lines) + + +def get_gap_summary(gaps: List[ChainGap]) -> str: + """ + Generate a human-readable summary of detected gaps. 
+ + Args: + gaps: List of detected gaps + + Returns: + Formatted string describing all gaps + """ + if not gaps: + return "No chain gaps detected" + + lines = [f"Detected {len(gaps)} chain gap(s):"] + for gap in gaps: + lines.append(f" - {gap}") + + return "\n".join(lines) diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 0d33e02..b2c0d29 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -205,6 +205,15 @@ def create_parser() -> argparse.ArgumentParser: metavar="N", help="Max L-BFGS iterations, 0=unlimited (default: 0)", ) + relax_group.add_argument( + "--no-split-gaps", + action="store_true", + help=( + "Disable automatic chain splitting at gaps. " + "By default, chains are split at detected gaps (missing residues) " + "to prevent artificial gap closure during minimization." + ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -323,6 +332,7 @@ def main(args=None) -> int: stiffness=opts.stiffness, max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, + split_chains_at_gaps=not opts.no_split_gaps, ) pipeline_config = PipelineConfig( diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index ed69714..7f55c99 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -41,6 +41,7 @@ class RelaxConfig: stiffness: float = 10.0 # kcal/mol/A^2 max_outer_iterations: int = 3 # Violation-fixing iterations constrained: bool = False # Use constrained (AmberRelaxation) minimization + split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index fefc176..de351a5 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -11,6 +11,12 @@ from openmm import app as openmm_app from openmm import openmm, unit +from graphrelax.chain_gaps import ( + detect_chain_gaps, + get_gap_summary, + restore_chain_ids, + 
split_chains_at_gaps, +) from graphrelax.config import RelaxConfig # Add vendored LigandMPNN to path for OpenFold imports @@ -54,17 +60,42 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Uses unconstrained minimization by default, or constrained AmberRelaxation if config.constrained is True. + If split_chains_at_gaps is enabled, chains will be split at detected + gaps before minimization to prevent artificial gap closure. + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ + # Detect and handle chain gaps if configured + chain_mapping = {} + if self.config.split_chains_at_gaps: + gaps = detect_chain_gaps(pdb_string) + if gaps: + logger.info(get_gap_summary(gaps)) + pdb_string, chain_mapping = split_chains_at_gaps( + pdb_string, gaps + ) + if self.config.constrained: prot = protein.from_pdb_string(pdb_string) - return self.relax_protein(prot) + relaxed_pdb, debug_info, violations = self.relax_protein(prot) else: - return self._relax_unconstrained(pdb_string) + relaxed_pdb, debug_info, violations = self._relax_unconstrained( + pdb_string + ) + + # Restore original chain IDs if chains were split + if chain_mapping: + relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) + debug_info["chains_split"] = True + debug_info["gaps_detected"] = len( + [k for k, v in chain_mapping.items() if k != v] + ) + + return relaxed_pdb, debug_info, violations def relax_pdb_file(self, pdb_path: Path) -> Tuple[str, dict, np.ndarray]: """ diff --git a/tests/test_chain_gaps.py b/tests/test_chain_gaps.py new file mode 100644 index 0000000..53b29d7 --- /dev/null +++ b/tests/test_chain_gaps.py @@ -0,0 +1,344 @@ +"""Tests for graphrelax.chain_gaps module.""" + +import pytest + +from graphrelax.chain_gaps import ( + ChainGap, + add_ter_records_at_gaps, + detect_chain_gaps, + get_gap_summary, + restore_chain_ids, + split_chains_at_gaps, +) + + +@pytest.fixture +def continuous_peptide_pdb(): + """A continuous 
peptide with no gaps (residues 1-5).""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 2 3.326 1.540 0.000 1.00 0.00 N +ATOM 6 CA ALA A 2 3.941 2.861 0.000 1.00 0.00 C +ATOM 7 C ALA A 2 5.459 2.789 0.000 1.00 0.00 C +ATOM 8 O ALA A 2 6.065 1.719 0.000 1.00 0.00 O +ATOM 9 N ALA A 3 6.063 3.970 0.000 1.00 0.00 N +ATOM 10 CA ALA A 3 7.510 4.096 0.000 1.00 0.00 C +ATOM 11 C ALA A 3 8.061 5.516 0.000 1.00 0.00 C +ATOM 12 O ALA A 3 7.298 6.486 0.000 1.00 0.00 O +ATOM 13 N ALA A 4 9.378 5.636 0.000 1.00 0.00 N +ATOM 14 CA ALA A 4 9.993 6.957 0.000 1.00 0.00 C +ATOM 15 C ALA A 4 11.511 6.885 0.000 1.00 0.00 C +ATOM 16 O ALA A 4 12.117 5.815 0.000 1.00 0.00 O +ATOM 17 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 18 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 19 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 20 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def gapped_peptide_pdb(): + """A peptide with a gap - residues 1, 2, 5, 6, 7 (missing 3-4).""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 2 3.326 1.540 0.000 1.00 0.00 N +ATOM 6 CA ALA A 2 3.941 2.861 0.000 1.00 0.00 C +ATOM 7 C ALA A 2 5.459 2.789 0.000 1.00 0.00 C +ATOM 8 O ALA A 2 6.065 1.719 0.000 1.00 0.00 O +ATOM 9 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 10 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 11 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 12 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +ATOM 13 N ALA A 6 15.115 10.066 0.000 1.00 0.00 N +ATOM 14 CA ALA A 6 16.562 10.192 0.000 1.00 0.00 C +ATOM 15 C ALA A 6 17.113 11.612 0.000 1.00 0.00 C +ATOM 16 O ALA A 6 16.350 12.582 
0.000 1.00 0.00 O +ATOM 17 N ALA A 7 18.115 12.066 0.000 1.00 0.00 N +ATOM 18 CA ALA A 7 19.562 12.192 0.000 1.00 0.00 C +ATOM 19 C ALA A 7 20.113 13.612 0.000 1.00 0.00 C +ATOM 20 O ALA A 7 19.350 14.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def multi_gap_pdb(): + """A peptide with multiple gaps - residues 1, 5, 10.""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 6 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 7 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 8 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +ATOM 9 N ALA A 10 22.115 18.066 0.000 1.00 0.00 N +ATOM 10 CA ALA A 10 23.562 18.192 0.000 1.00 0.00 C +ATOM 11 C ALA A 10 24.113 19.612 0.000 1.00 0.00 C +ATOM 12 O ALA A 10 23.350 20.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def multi_chain_gapped_pdb(): + """A multi-chain structure with a gap in chain A and continuous chain B.""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 6 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 7 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 8 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +TER +ATOM 9 N ALA B 1 30.000 0.000 0.000 1.00 0.00 N +ATOM 10 CA ALA B 1 31.458 0.000 0.000 1.00 0.00 C +ATOM 11 C ALA B 1 32.009 1.420 0.000 1.00 0.00 C +ATOM 12 O ALA B 1 31.246 2.390 0.000 1.00 0.00 O +ATOM 13 N ALA B 2 33.326 1.540 0.000 1.00 0.00 N +ATOM 14 CA ALA B 2 33.941 2.861 0.000 1.00 0.00 C +ATOM 15 C ALA B 2 35.459 2.789 0.000 1.00 0.00 C +ATOM 16 O ALA B 2 36.065 1.719 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: 
on + + +class TestDetectChainGaps: + """Tests for detect_chain_gaps function.""" + + def test_no_gaps_in_continuous_peptide(self, continuous_peptide_pdb): + """Test that no gaps are detected in a continuous peptide.""" + gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) + assert len(gaps) == 0 + + def test_detects_single_gap(self, gapped_peptide_pdb): + """Test detection of a single gap.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) + assert len(gaps) == 1 + assert gaps[0].chain_id == "A" + assert gaps[0].residue_before == 2 + assert gaps[0].residue_after == 5 + + def test_detects_multiple_gaps(self, multi_gap_pdb): + """Test detection of multiple gaps.""" + gaps = detect_chain_gaps(multi_gap_pdb, check_distance=False) + assert len(gaps) == 2 + + # First gap: 1 -> 5 + assert gaps[0].chain_id == "A" + assert gaps[0].residue_before == 1 + assert gaps[0].residue_after == 5 + + # Second gap: 5 -> 10 + assert gaps[1].chain_id == "A" + assert gaps[1].residue_before == 5 + assert gaps[1].residue_after == 10 + + def test_detects_gap_in_specific_chain(self, multi_chain_gapped_pdb): + """Test that gaps are correctly attributed to their chains.""" + gaps = detect_chain_gaps(multi_chain_gapped_pdb, check_distance=False) + + # Only chain A has a gap + assert len(gaps) == 1 + assert gaps[0].chain_id == "A" + + def test_distance_check_finds_large_cn_distance(self, gapped_peptide_pdb): + """Test that distance checking finds large C-N distances.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=True) + + # Should find the gap + assert len(gaps) >= 1 + # The gap should have a distance measurement + gap = gaps[0] + assert gap.distance is None or gap.distance > 2.0 + + +class TestChainGap: + """Tests for ChainGap dataclass.""" + + def test_str_representation(self): + """Test string representation of ChainGap.""" + gap = ChainGap( + chain_id="A", + residue_before=10, + residue_after=15, + distance=5.5, + ) + s = str(gap) + 
assert "A" in s + assert "4 residues" in s # 15 - 10 - 1 = 4 missing + assert "10" in s + assert "15" in s + assert "5.50" in s + + def test_str_without_distance(self): + """Test string representation without distance.""" + gap = ChainGap( + chain_id="B", + residue_before=1, + residue_after=5, + ) + s = str(gap) + assert "B" in s + assert "3 residues" in s + assert "dist=" not in s + + +class TestSplitChainsAtGaps: + """Tests for split_chains_at_gaps function.""" + + def test_no_split_without_gaps(self, continuous_peptide_pdb): + """Test that continuous peptide is not split.""" + result, mapping = split_chains_at_gaps(continuous_peptide_pdb) + assert result == continuous_peptide_pdb + assert mapping == {} + + def test_splits_at_single_gap(self, gapped_peptide_pdb): + """Test splitting at a single gap.""" + result, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # Should have new chain ID for segment after gap + assert len(mapping) > 0 + + # Check that atoms after gap have new chain ID + lines = result.split("\n") + chains_found = set() + for line in lines: + if line.startswith("ATOM") and len(line) > 21: + chains_found.add(line[21]) + + # Should have at least 2 chains (original A + new segment) + assert len(chains_found) >= 2 + + def test_splits_at_multiple_gaps(self, multi_gap_pdb): + """Test splitting at multiple gaps.""" + result, mapping = split_chains_at_gaps(multi_gap_pdb) + + # Count unique chains in result + lines = result.split("\n") + chains_found = set() + for line in lines: + if line.startswith("ATOM") and len(line) > 21: + chains_found.add(line[21]) + + # Should have 3 segments (original + 2 new) + assert len(chains_found) == 3 + + def test_preserves_atom_coordinates(self, gapped_peptide_pdb): + """Test that splitting preserves atom coordinates.""" + result, _ = split_chains_at_gaps(gapped_peptide_pdb) + + # Count atoms in original and result + orig_atoms = sum( + 1 + for line in gapped_peptide_pdb.split("\n") + if line.startswith("ATOM") + ) + 
result_atoms = sum( + 1 for line in result.split("\n") if line.startswith("ATOM") + ) + + assert orig_atoms == result_atoms + + def test_mapping_tracks_original_chain(self, gapped_peptide_pdb): + """Test that mapping correctly tracks original chain IDs.""" + _, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # All mapped chains should map back to A + for orig_chain in mapping.values(): + assert orig_chain == "A" + + +class TestRestoreChainIds: + """Tests for restore_chain_ids function.""" + + def test_no_change_without_mapping(self, gapped_peptide_pdb): + """Test that PDB is unchanged without mapping.""" + result = restore_chain_ids(gapped_peptide_pdb, {}) + assert result == gapped_peptide_pdb + + def test_restores_original_chain_ids(self, gapped_peptide_pdb): + """Test that original chain IDs are restored after splitting.""" + # Split chains + split_pdb, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # Restore original chain IDs + restored = restore_chain_ids(split_pdb, mapping) + + # All atoms should be in chain A again + for line in restored.split("\n"): + if line.startswith("ATOM") and len(line) > 21: + assert line[21] == "A" + + def test_roundtrip_preserves_atoms(self, gapped_peptide_pdb): + """Test split -> restore roundtrip preserves atom count.""" + orig_atoms = sum( + 1 + for line in gapped_peptide_pdb.split("\n") + if line.startswith("ATOM") + ) + + split_pdb, mapping = split_chains_at_gaps(gapped_peptide_pdb) + restored = restore_chain_ids(split_pdb, mapping) + + restored_atoms = sum( + 1 for line in restored.split("\n") if line.startswith("ATOM") + ) + + assert orig_atoms == restored_atoms + + +class TestAddTerRecordsAtGaps: + """Tests for add_ter_records_at_gaps function.""" + + def test_no_ter_without_gaps(self, continuous_peptide_pdb): + """Test no TER records added to continuous peptide.""" + gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) + result = add_ter_records_at_gaps(continuous_peptide_pdb, gaps) + assert 
result == continuous_peptide_pdb + + def test_adds_ter_at_gap(self, gapped_peptide_pdb): + """Test TER record is added at gap location.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) + result = add_ter_records_at_gaps(gapped_peptide_pdb, gaps) + + # Count TER records + ter_count = sum( + 1 for line in result.split("\n") if line.startswith("TER") + ) + assert ter_count >= 1 + + +class TestGetGapSummary: + """Tests for get_gap_summary function.""" + + def test_no_gaps_message(self): + """Test message when no gaps detected.""" + summary = get_gap_summary([]) + assert "No chain gaps detected" in summary + + def test_summary_with_gaps(self): + """Test summary includes gap information.""" + gaps = [ + ChainGap(chain_id="A", residue_before=10, residue_after=15), + ChainGap(chain_id="B", residue_before=5, residue_after=20), + ] + summary = get_gap_summary(gaps) + + assert "2 chain gap(s)" in summary + assert "Chain A" in summary + assert "Chain B" in summary From e1393e7b74dadd0336059d25dcccf8674e23a7d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 15:10:10 +0000 Subject: [PATCH 04/22] Fix chain splitting to assign new chain ID once per segment, not per atom The bug was assigning a new chain ID for every atom at a gap start residue instead of just once when entering a new segment. Added tracking of processed gap starts to prevent duplicate chain assignments. 
--- src/graphrelax/chain_gaps.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py index 49b15ae..e7404b7 100644 --- a/src/graphrelax/chain_gaps.py +++ b/src/graphrelax/chain_gaps.py @@ -171,9 +171,11 @@ def split_chains_at_gaps( current_chain_map = {} # original_chain_id -> current_new_chain_id next_id_idx = 0 + # Track which gap starts we've already processed + processed_gap_starts = set() + # Process PDB line by line output_lines = [] - current_residue = {} # chain_id -> (resnum, icode) for line in pdb_string.split("\n"): if line.startswith(("ATOM", "HETATM")) and len(line) > 26: @@ -181,8 +183,12 @@ def split_chains_at_gaps( resnum = int(line[22:26].strip()) icode = line[26].strip() if len(line) > 26 else "" + gap_key = (chain_id, resnum, icode) + # Check if this residue starts a new segment (at a gap) - if (chain_id, resnum, icode) in gap_starts: + # Only process each gap start once + if gap_key in gap_starts and gap_key not in processed_gap_starts: + processed_gap_starts.add(gap_key) # Assign a new chain ID for this segment if next_id_idx < len(available_ids): new_chain_id = available_ids[next_id_idx] @@ -209,8 +215,6 @@ def split_chains_at_gaps( new_chain_id = current_chain_map[chain_id] line = line[:21] + new_chain_id + line[22:] - current_residue[chain_id] = (resnum, icode) - elif line.startswith("TER") and len(line) > 21: # Update TER record chain ID too chain_id = line[21] From 7bb847b45815f1901801df8e13c3c0c0004163ff Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:00:25 -0500 Subject: [PATCH 05/22] Fix CI disk space issue in integration tests - Free up disk space by removing unused .NET, GHC, and Boost packages - Install CPU-only PyTorch to avoid large CUDA dependencies - Use --no-cache-dir to minimize pip cache usage The GitHub Actions runner was running out of disk space when installing PyTorch with CUDA dependencies (~5-7GB) alongside conda 
packages. Co-Authored-By: Claude Opus 4.5 --- .github/workflows/test.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7a2a6f..0b0c004 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,13 +41,21 @@ jobs: activate-environment: test auto-activate-base: false + - name: Free up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + df -h + - name: Install conda dependencies run: | conda install -y openmm pdbfixer - name: Install pip dependencies run: | - pip install -e .[test] + pip install torch --index-url https://download.pytorch.org/whl/cpu + pip install --no-cache-dir -e .[test] - name: Download LigandMPNN weights run: | From 34bb8727868be8f078fe24e86677f2f4da073b07 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:30:41 -0500 Subject: [PATCH 06/22] Add ligand-aware unconstrained minimization support Enable unconstrained minimization for protein-ligand complexes using openmmforcefields for small molecule parameterization. Changes: - Add include_ligands, ligand_forcefield, and ligand_smiles config options - Create ligand_utils.py module for ligand extraction and parameterization - Add _relax_unconstrained_with_ligands method using SystemGenerator - Update CLI with --include-ligands, --ligand-forcefield, --ligand-smiles - Add [ligands] optional dependency group to pyproject.toml - Add unit tests for ligand utilities - Add ligand_utils.py to pylint exclude (optional dependency imports) The implementation: 1. Separates protein (ATOM) and ligands (HETATM) from PDB 2. Processes protein with pdbfixer (avoiding terminal detection issues) 3. Parameterizes ligands with OpenFF Sage 2.0 (default) or GAFF2/Espaloma 4. 
Combines topologies and minimizes together Usage: graphrelax -i complex.pdb -o minimized.pdb --include-ligands graphrelax -i complex.pdb -o minimized.pdb --include-ligands \ --ligand-forcefield gaff-2.11 --ligand-smiles 'LIG:c1ccccc1' Requires: pip install graphrelax[ligands] Co-Authored-By: Claude Opus 4.5 --- .pre-commit-config.yaml | 2 +- pyproject.toml | 10 ++ src/graphrelax/cli.py | 74 +++++++++-- src/graphrelax/config.py | 15 ++- src/graphrelax/ligand_utils.py | 236 +++++++++++++++++++++++++++++++++ src/graphrelax/relaxer.py | 176 ++++++++++++++++++++++++ tests/test_ligand_utils.py | 163 +++++++++++++++++++++++ 7 files changed, 666 insertions(+), 10 deletions(-) create mode 100644 src/graphrelax/ligand_utils.py create mode 100644 tests/test_ligand_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b476cb..362fbc1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - "--enable=C0415" - "--score=n" name: pylint (no lazy imports) - exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/LigandMPNN/|tests/)" + exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/LigandMPNN/|tests/)" - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/pyproject.toml b/pyproject.toml index 82b2ac4..266e8a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,10 @@ dependencies = [ # Note: pdbfixer is not on PyPI, install via conda: # conda install -c conda-forge pdbfixer +# For ligand support in unconstrained minimization: +# pip install graphrelax[ligands] +# or: +# conda install -c conda-forge openmmforcefields openff-toolkit rdkit [project.urls] Homepage = "https://github.com/delalamo/GraphRelax" @@ -63,6 +67,12 @@ test = [ "pytest>=7.0", 
"pytest-cov>=4.1", ] +ligands = [ + # For ligand parameterization in unconstrained minimization + "openmmforcefields>=0.13.0", + "openff-toolkit>=0.14.0", + "rdkit>=2023.09.1", +] [project.scripts] graphrelax = "graphrelax.cli:main" diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 870d1c4..0a66860 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -216,6 +216,36 @@ def create_parser() -> argparse.ArgumentParser: "to prevent artificial gap closure during minimization." ), ) + relax_group.add_argument( + "--include-ligands", + action="store_true", + help=( + "Include ligands in unconstrained minimization using " + "openmmforcefields. Requires: pip install openmmforcefields " + "openff-toolkit. If not specified with ligands present, " + "--constrained-minimization is required." + ), + ) + relax_group.add_argument( + "--ligand-forcefield", + choices=["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"], + default="openff-2.0.0", + metavar="FF", + help=( + "Force field for ligand parameterization (default: openff-2.0.0)." + " Options: openff-2.0.0 (Sage), gaff-2.11 (GAFF2), espaloma-0.3.0" + ), + ) + relax_group.add_argument( + "--ligand-smiles", + type=str, + action="append", + metavar="RESNAME:SMILES", + help=( + "Provide SMILES for a ligand residue. Format: RESNAME:SMILES. " + "Can be used multiple times. 
Example: --ligand-smiles LIG:SMILES" + ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -307,19 +337,31 @@ def main(args=None) -> int: input_format = detect_format(opts.input) has_ligands = _check_for_ligands(opts.input, input_format) - # Validate: ligand_mpnn with ligands requires constrained minimization + # Validate: ligand handling requires --include-ligands or --constrained uses_relaxation = mode in ( PipelineMode.RELAX, PipelineMode.NO_REPACK, PipelineMode.DESIGN, ) - if has_ligands and uses_relaxation and not opts.constrained_minimization: - logger.error( - "Input PDB contains ligands (HETATM records). " - "Unconstrained minimization cannot handle non-standard residues. " - "Please use --constrained-minimization flag." - ) - return 1 + if has_ligands and uses_relaxation: + if opts.include_ligands: + # Verify openmmforcefields is available + try: + import openmmforcefields # noqa: F401 + except ImportError: + logger.error( + "The --include-ligands flag requires openmmforcefields. " + "Install with: pip install openmmforcefields openff-toolkit" + ) + return 1 + logger.info("Ligand support enabled via openmmforcefields") + elif not opts.constrained_minimization: + logger.error( + "Input PDB contains ligands (HETATM records). Use one of:\n" + " 1. --include-ligands (requires openmmforcefields)\n" + " 2. --constrained-minimization" + ) + return 1 logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") @@ -333,11 +375,27 @@ def main(args=None) -> int: seed=opts.seed, ) + # Parse ligand SMILES if provided + ligand_smiles = {} + if opts.ligand_smiles: + for entry in opts.ligand_smiles: + if ":" not in entry: + logger.error( + f"Invalid --ligand-smiles format: '{entry}'. 
" + "Expected format: RESNAME:SMILES" + ) + return 1 + resname, smiles = entry.split(":", 1) + ligand_smiles[resname.strip().upper()] = smiles.strip() + relax_config = RelaxConfig( stiffness=opts.stiffness, max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, + include_ligands=opts.include_ligands, + ligand_forcefield=opts.ligand_forcefield, + ligand_smiles=ligand_smiles, ) pipeline_config = PipelineConfig( diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 7f55c99..c00f37a 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -3,7 +3,9 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Literal, Optional +from typing import Dict, Literal, Optional + +LigandForceField = Literal["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"] class PipelineMode(Enum): @@ -44,6 +46,17 @@ class RelaxConfig: split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available + # Ligand support options + include_ligands: bool = ( + False # Enable ligand parameterization in unconstrained + ) + ligand_forcefield: LigandForceField = ( + "openff-2.0.0" # Force field for ligands + ) + ligand_smiles: Dict[str, str] = field( + default_factory=dict + ) # {resname: SMILES} + @dataclass class PipelineConfig: diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py new file mode 100644 index 0000000..d5e5e9b --- /dev/null +++ b/src/graphrelax/ligand_utils.py @@ -0,0 +1,236 @@ +"""Utilities for ligand parameterization and handling.""" + +import io +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +# Water residue names to exclude from ligand processing +WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} + + +@dataclass +class LigandInfo: + """Information about 
a ligand extracted from PDB.""" + + resname: str + chain_id: str + resnum: int + pdb_lines: List[str] + smiles: Optional[str] = None + + +def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: + """ + Separate protein ATOM records from ligand HETATM records. + + Args: + pdb_string: Full PDB string with protein and ligands + + Returns: + Tuple of (protein_only_pdb, list_of_ligand_info) + """ + protein_lines = [] + ligand_lines_by_residue = {} + + for line in pdb_string.split("\n"): + if line.startswith("HETATM"): + resname = line[17:20].strip() + chain_id = line[21] if len(line) > 21 else " " + try: + resnum = int(line[22:26].strip()) + except ValueError: + resnum = 0 + + # Skip water + if resname in WATER_RESIDUES: + continue + + key = (chain_id, resnum, resname) + if key not in ligand_lines_by_residue: + ligand_lines_by_residue[key] = [] + ligand_lines_by_residue[key].append(line) + elif line.startswith("END"): + pass # Skip, will add back + else: + protein_lines.append(line) + + protein_pdb = "\n".join(protein_lines) + "\nEND\n" + + ligands = [] + for (chain_id, resnum, resname), lines in ligand_lines_by_residue.items(): + ligands.append( + LigandInfo( + resname=resname, + chain_id=chain_id, + resnum=resnum, + pdb_lines=lines, + ) + ) + + return protein_pdb, ligands + + +def get_common_ligand_smiles() -> Dict[str, str]: + """ + Return SMILES for commonly encountered ligands. + + This helps with ligands where automatic bond perception may fail. 
+ """ + return { + # Heme and porphyrins + "HEM": ( + "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" + "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" + ), + "HEC": ( + "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" + "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" + ), + # Common cofactors + "NAD": ( + "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OC[C@H]2O[C@H]([C@H](O)[C@@H]2O)n2cnc3c(N)ncnc23)[C@@H](O)[C@H]1O" + ), + "NAP": ( # NADP + "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OC[C@H]2O[C@H]([C@H](OP(=O)([O-])[O-])[C@@H]2O)n2cnc3c(N)ncnc23)" + "[C@@H](O)[C@H]1O" + ), + "FAD": ( + "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" + "COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)" + "n3cnc4c(N)ncnc43)c2cc1C" + ), + "FMN": ( + "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" + "COP(=O)([O-])[O-])c2cc1C" + ), + "ATP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + "ADP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])" + "[C@@H](O)[C@H]1O" + ), + "AMP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + "GTP": ( + "Nc1nc2n(cnc2c(=O)[nH]1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + # Common ions (simple) + "ZN": "[Zn+2]", + "MG": "[Mg+2]", + "CA": "[Ca+2]", + "FE": "[Fe+2]", + "FE2": "[Fe+2]", + "MN": "[Mn+2]", + "CU": "[Cu+2]", + "CO": "[Co+2]", + # Common small molecules + "ACE": "CC(=O)", # Acetyl + "NME": "NC", # N-methyl + "ACT": "CC(=O)[O-]", # Acetate + "GOL": "OCC(O)CO", # Glycerol + "EDO": "OCCO", # Ethylene glycol + "PEG": "COCCOCCOCCO", # PEG fragment + "SO4": "[O-]S(=O)(=O)[O-]", # Sulfate + "PO4": "[O-]P(=O)([O-])[O-]", # Phosphate + "CL": "[Cl-]", # Chloride + } + + +def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): + """ + Create an OpenFF Toolkit 
Molecule from ligand info. + + Args: + ligand: LigandInfo with PDB coordinates + smiles: Optional SMILES string (if known) + + Returns: + openff.toolkit.Molecule + """ + from openff.toolkit import Molecule + + if smiles: + # Create from SMILES + try: + mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) + logger.debug(f"Created molecule for {ligand.resname} from SMILES") + return mol + except Exception as e: + logger.warning(f"Failed to create from SMILES: {e}") + + # Try to create from PDB block + pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + + try: + # Try direct PDB parsing + mol = Molecule.from_pdb_file( + io.StringIO(pdb_block), + allow_undefined_stereo=True, + ) + logger.debug(f"Created molecule for {ligand.resname} from PDB") + return mol + except Exception as e: + logger.debug(f"Direct PDB parsing failed: {e}") + + # Fallback: use RDKit + try: + mol = _create_molecule_via_rdkit(pdb_block) + logger.debug(f"Created molecule for {ligand.resname} via RDKit") + return mol + except Exception as e: + raise ValueError( + f"Could not create molecule for ligand {ligand.resname}: {e}" + ) + + +def _create_molecule_via_rdkit(pdb_block: str): + """Create OpenFF Molecule via RDKit from PDB block.""" + from openff.toolkit import Molecule + from rdkit import Chem + from rdkit.Chem import AllChem + + # Parse PDB with RDKit + mol = Chem.MolFromPDBBlock(pdb_block, removeHs=False, sanitize=False) + + if mol is None: + raise ValueError("RDKit could not parse PDB block") + + # Try to sanitize + try: + Chem.SanitizeMol(mol) + except Exception: + # Try without hydrogens and re-add them + mol = Chem.MolFromPDBBlock(pdb_block, removeHs=True, sanitize=True) + if mol is None: + raise ValueError("RDKit sanitization failed") + mol = Chem.AddHs(mol, addCoords=True) + AllChem.EmbedMolecule(mol, randomSeed=42) + + # Convert to OpenFF Molecule + return Molecule.from_rdkit(mol, allow_undefined_stereo=True) + + +def ligand_pdb_to_topology(ligand: LigandInfo): + """ + 
Convert ligand PDB lines to OpenMM topology and positions. + + Args: + ligand: LigandInfo with PDB lines + + Returns: + Tuple of (topology, positions) + """ + from openmm import app as openmm_app + + pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block)) + return pdb_file.topology, pdb_file.positions diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index de351a5..e4b1a5f 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -18,6 +18,21 @@ split_chains_at_gaps, ) from graphrelax.config import RelaxConfig +from graphrelax.ligand_utils import ( + WATER_RESIDUES, + create_openff_molecule, + extract_ligands_from_pdb, + get_common_ligand_smiles, + ligand_pdb_to_topology, +) + +# Check if openmmforcefields is available for ligand support +try: + from openmmforcefields.generators import SystemGenerator + + OPENMMFF_AVAILABLE = True +except ImportError: + OPENMMFF_AVAILABLE = False # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -156,12 +171,30 @@ def _relax_unconstrained( No position restraints, no violation checking, uses OpenMM defaults. This is the default minimization mode. + If config.include_ligands is True and ligands are present, uses + openmmforcefields for ligand parameterization. + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ + # Check if ligands are present (non-water HETATM records) + has_ligands = any( + line.startswith("HETATM") + and line[17:20].strip() not in WATER_RESIDUES + for line in pdb_string.split("\n") + ) + + if has_ligands and self.config.include_ligands: + if not OPENMMFF_AVAILABLE: + raise ImportError( + "openmmforcefields is required for ligand support. 
" + "Install with: pip install openmmforcefields openff-toolkit" + ) + return self._relax_unconstrained_with_ligands(pdb_string) + ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms @@ -261,6 +294,149 @@ def _relax_unconstrained( return relaxed_pdb, debug_data, violations + def _relax_unconstrained_with_ligands( + self, pdb_string: str + ) -> Tuple[str, dict, np.ndarray]: + """ + Unconstrained OpenMM minimization with ligand support. + + Uses openmmforcefields to parameterize ligands with GAFF2/OpenFF. + The protein is processed separately with pdbfixer, then combined + with parameterized ligands for minimization. + + Args: + pdb_string: PDB file contents as string + + Returns: + Tuple of (relaxed_pdb_string, debug_info, violations) + """ + from pdbfixer import PDBFixer + + ENERGY = unit.kilocalories_per_mole + LENGTH = unit.angstroms + + use_gpu = self._check_gpu_available() + + logger.info( + f"Running unconstrained minimization with ligands " + f"(forcefield={self.config.ligand_forcefield}, gpu={use_gpu})" + ) + + # Step 1: Separate protein and ligands + protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + logger.info( + f"Found {len(ligands)} ligand(s): {[l.resname for l in ligands]}" + ) + + # Step 2: Fix protein with pdbfixer (without ligands) + fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) + fixer.findMissingResidues() + fixer.findMissingAtoms() + fixer.addMissingAtoms() + + # Step 3: Create OpenFF molecules for ligands + known_smiles = get_common_ligand_smiles() + known_smiles.update(self.config.ligand_smiles or {}) + + openff_molecules = [] + for ligand in ligands: + smiles = known_smiles.get(ligand.resname) + try: + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) + logger.debug(f"Created OpenFF molecule for {ligand.resname}") + except Exception as e: + resname = ligand.resname + raise ValueError( + f"Could not parameterize ligand '{resname}' " + f"(chain {ligand.chain_id}, res {ligand.resnum}): 
{e}\n\n" + f"Options to resolve:\n" + f" 1. Provide SMILES via config: " + f"ligand_smiles={{'{resname}': 'SMILES_STRING'}}\n" + f" 2. Use constrained minimization: constrained=True\n" + f" 3. Remove the ligand from the input PDB\n\n" + f"For common ligands, see: " + f"https://www.ebi.ac.uk/pdbe-srv/pdbechem/" + ) + + # Step 4: Create combined topology using Modeller + modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + + # Add ligands back to modeller + for ligand in ligands: + ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) + modeller.add(ligand_topology, ligand_positions) + + # Step 5: Create SystemGenerator with ligand support + system_generator = SystemGenerator( + forcefields=["amber/ff14SB.xml"], + small_molecule_forcefield=self.config.ligand_forcefield, + molecules=openff_molecules, + forcefield_kwargs={ + "constraints": openmm_app.HBonds, + "removeCMMotion": True, + }, + ) + + # Add hydrogens + modeller.addHydrogens(system_generator.forcefield) + + # Create system + system = system_generator.create_system(modeller.topology) + + # Step 6: Run minimization + integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) + platform = openmm.Platform.getPlatformByName( + "CUDA" if use_gpu else "CPU" + ) + simulation = openmm_app.Simulation( + modeller.topology, system, integrator, platform + ) + simulation.context.setPositions(modeller.positions) + + # Get initial energy + state = simulation.context.getState(getEnergy=True, getPositions=True) + einit = state.getPotentialEnergy().value_in_unit(ENERGY) + posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) + + # Minimize + if self.config.max_iterations > 0: + simulation.minimizeEnergy(maxIterations=self.config.max_iterations) + else: + simulation.minimizeEnergy() + + # Get final state + state = simulation.context.getState(getEnergy=True, getPositions=True) + efinal = state.getPotentialEnergy().value_in_unit(ENERGY) + pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) 
+ + # Calculate RMSD + rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) + + # Write output PDB + output = io.StringIO() + openmm_app.PDBFile.writeFile( + simulation.topology, state.getPositions(), output + ) + relaxed_pdb = output.getvalue() + + debug_data = { + "initial_energy": einit, + "final_energy": efinal, + "rmsd": rmsd, + "attempts": 1, + "ligands_included": [lig.resname for lig in ligands], + "ligand_forcefield": self.config.ligand_forcefield, + } + + logger.info( + f"Minimization complete: E_init={einit:.2f}, " + f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" + ) + + violations = np.zeros(0) + return relaxed_pdb, debug_data, violations + def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ Direct OpenMM minimization without pdbfixer. diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py new file mode 100644 index 0000000..f83b1b0 --- /dev/null +++ b/tests/test_ligand_utils.py @@ -0,0 +1,163 @@ +"""Unit tests for ligand utilities.""" + +from graphrelax.ligand_utils import ( + WATER_RESIDUES, + LigandInfo, + extract_ligands_from_pdb, + get_common_ligand_smiles, +) + + +class TestExtractLigands: + """Tests for ligand extraction from PDB.""" + + def test_extract_no_ligands(self, small_peptide_pdb_string): + """Test extraction from protein-only PDB.""" + protein_pdb, ligands = extract_ligands_from_pdb( + small_peptide_pdb_string + ) + + assert len(ligands) == 0 + assert "ATOM" in protein_pdb + assert "END" in protein_pdb + + def test_extract_with_ligand(self): + """Test extraction of a ligand from PDB.""" + # fmt: off + pdb_with_ligand = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "ATOM 3 C ALA A 1 2.0 1.4 0.0 1.00 0.00\n" + "ATOM 4 O ALA A 1 1.2 2.4 0.0 1.00 0.00\n" + "HETATM 5 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 NA HEM A 200 3.5 5.0 5.0 1.00 0.00\n" + "HETATM 7 NB HEM A 200 5.0 3.5 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = 
extract_ligands_from_pdb(pdb_with_ligand) + + assert len(ligands) == 1 + assert ligands[0].resname == "HEM" + assert ligands[0].chain_id == "A" + assert ligands[0].resnum == 200 + assert len(ligands[0].pdb_lines) == 3 # FE, NA, NB + + # Check protein PDB doesn't have HETATM + assert "HETATM" not in protein_pdb + assert "ATOM" in protein_pdb + + def test_water_excluded(self): + """Test that water is not extracted as ligand.""" + # fmt: off + pdb_with_water = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "HETATM 5 O HOH A 301 10.0 10.0 10.0 1.00 0.00\n" + "HETATM 6 O WAT A 302 11.0 10.0 10.0 1.00 0.00\n" + "HETATM 7 O SOL A 303 12.0 10.0 10.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = extract_ligands_from_pdb(pdb_with_water) + + # Waters should not be in ligand list + assert len(ligands) == 0 + assert not any(lig.resname in WATER_RESIDUES for lig in ligands) + + def test_multiple_ligands(self): + """Test extraction of multiple different ligands.""" + # fmt: off + pdb_with_multiple = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 5 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 N1 NAD B 301 8.0 8.0 8.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = extract_ligands_from_pdb(pdb_with_multiple) + + assert len(ligands) == 2 + resnames = {lig.resname for lig in ligands} + assert resnames == {"HEM", "NAD"} + + +class TestCommonSmiles: + """Tests for common ligand SMILES lookup.""" + + def test_heme_smiles_present(self): + """Test that HEM SMILES is defined.""" + smiles = get_common_ligand_smiles() + assert "HEM" in smiles + assert len(smiles["HEM"]) > 0 + # Should contain iron + assert "Fe" in smiles["HEM"] + + def test_atp_smiles_present(self): + """Test that ATP SMILES is defined.""" + smiles = get_common_ligand_smiles() + assert "ATP" in smiles + # ATP has phosphate groups + assert "P" in smiles["ATP"] + + def test_common_cofactors(self): + """Test that common cofactors 
are defined.""" + smiles = get_common_ligand_smiles() + expected = ["NAD", "FAD", "ATP", "ADP", "GTP"] + for cofactor in expected: + assert cofactor in smiles, f"{cofactor} should be in common SMILES" + + def test_common_ions(self): + """Test that common metal ions are defined.""" + smiles = get_common_ligand_smiles() + ions = ["ZN", "MG", "CA", "FE", "MN", "CU"] + for ion in ions: + assert ion in smiles, f"{ion} should be in common SMILES" + + +class TestLigandInfo: + """Tests for LigandInfo dataclass.""" + + def test_ligand_info_creation(self): + """Test creating a LigandInfo object.""" + ligand = LigandInfo( + resname="HEM", + chain_id="A", + resnum=200, + pdb_lines=[ + "HETATM 5 FE HEM A 200 5.000 5.000 5.000" + ], + ) + assert ligand.resname == "HEM" + assert ligand.chain_id == "A" + assert ligand.resnum == 200 + assert ligand.smiles is None + + def test_ligand_info_with_smiles(self): + """Test LigandInfo with optional SMILES.""" + ligand = LigandInfo( + resname="BEN", + chain_id="B", + resnum=1, + pdb_lines=[ + "HETATM 1 C1 BEN B 1 0.000 0.000 0.000" + ], + smiles="c1ccccc1", + ) + assert ligand.smiles == "c1ccccc1" + + +class TestWaterResidues: + """Tests for water residue constants.""" + + def test_common_water_names(self): + """Test that common water names are included.""" + assert "HOH" in WATER_RESIDUES + assert "WAT" in WATER_RESIDUES + assert "SOL" in WATER_RESIDUES + + def test_tip_models(self): + """Test that TIP water models are included.""" + assert "TIP3" in WATER_RESIDUES + assert "TIP4" in WATER_RESIDUES + assert "SPC" in WATER_RESIDUES From a1da6da02bafa1fc756447eb6cf8a37b20798d02 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:35:35 -0500 Subject: [PATCH 07/22] Refactor ligand SMILES: use RDKit parsing instead of hardcoded dictionary Replace the large hardcoded SMILES dictionary with dynamic extraction from PDB coordinates using RDKit bond perception: - Rename get_common_ligand_smiles() to get_ion_smiles() (only ions need 
explicit SMILES since they're single atoms without bond info) - Add is_single_atom_ligand() helper function - Update create_openff_molecule() to try RDKit parsing first, then fall back to OpenFF PDB parsing, using ion lookup only for single atoms - Update relaxer.py to use the new approach - Update tests to reflect the refactored functions This removes ~70 lines of hardcoded SMILES while making the code more robust - it can now handle any ligand that RDKit can perceive bonds for. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/ligand_utils.py | 114 +++++++++++++-------------------- src/graphrelax/relaxer.py | 12 ++-- tests/test_ligand_utils.py | 74 +++++++++++++-------- 3 files changed, 96 insertions(+), 104 deletions(-) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index d5e5e9b..0c5d702 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -73,57 +73,14 @@ def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: return protein_pdb, ligands -def get_common_ligand_smiles() -> Dict[str, str]: +def get_ion_smiles() -> Dict[str, str]: """ - Return SMILES for commonly encountered ligands. + Return SMILES for common ions. - This helps with ligands where automatic bond perception may fail. + Ions are single atoms that cannot be parsed from PDB coordinates, + so we need explicit SMILES for them. 
""" return { - # Heme and porphyrins - "HEM": ( - "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" - "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" - ), - "HEC": ( - "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" - "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" - ), - # Common cofactors - "NAD": ( - "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OC[C@H]2O[C@H]([C@H](O)[C@@H]2O)n2cnc3c(N)ncnc23)[C@@H](O)[C@H]1O" - ), - "NAP": ( # NADP - "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OC[C@H]2O[C@H]([C@H](OP(=O)([O-])[O-])[C@@H]2O)n2cnc3c(N)ncnc23)" - "[C@@H](O)[C@H]1O" - ), - "FAD": ( - "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" - "COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)" - "n3cnc4c(N)ncnc43)c2cc1C" - ), - "FMN": ( - "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" - "COP(=O)([O-])[O-])c2cc1C" - ), - "ATP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - "ADP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])" - "[C@@H](O)[C@H]1O" - ), - "AMP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - "GTP": ( - "Nc1nc2n(cnc2c(=O)[nH]1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - # Common ions (simple) "ZN": "[Zn+2]", "MG": "[Mg+2]", "CA": "[Ca+2]", @@ -132,60 +89,77 @@ def get_common_ligand_smiles() -> Dict[str, str]: "MN": "[Mn+2]", "CU": "[Cu+2]", "CO": "[Co+2]", - # Common small molecules - "ACE": "CC(=O)", # Acetyl - "NME": "NC", # N-methyl - "ACT": "CC(=O)[O-]", # Acetate - "GOL": "OCC(O)CO", # Glycerol - "EDO": "OCCO", # Ethylene glycol - "PEG": "COCCOCCOCCO", # PEG fragment - "SO4": "[O-]S(=O)(=O)[O-]", # Sulfate - "PO4": "[O-]P(=O)([O-])[O-]", # Phosphate - "CL": "[Cl-]", # Chloride + "NA": "[Na+]", + "K": "[K+]", + "CL": "[Cl-]", } +def is_single_atom_ligand(ligand: LigandInfo) -> 
bool: + """Check if ligand is a single atom (ion).""" + return len(ligand.pdb_lines) == 1 + + def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): """ Create an OpenFF Toolkit Molecule from ligand info. + Attempts to create the molecule in this order: + 1. From user-provided SMILES (if given) + 2. From ion lookup table (for single-atom ligands) + 3. From PDB coordinates via RDKit bond perception + 4. From direct OpenFF PDB parsing + Args: ligand: LigandInfo with PDB coordinates - smiles: Optional SMILES string (if known) + smiles: Optional SMILES string (overrides automatic detection) Returns: openff.toolkit.Molecule """ from openff.toolkit import Molecule + # 1. User-provided SMILES takes precedence if smiles: - # Create from SMILES try: mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) logger.debug(f"Created molecule for {ligand.resname} from SMILES") return mol except Exception as e: - logger.warning(f"Failed to create from SMILES: {e}") + logger.warning(f"Failed to create from provided SMILES: {e}") + + # 2. Handle ions (single atoms) - need explicit SMILES + if is_single_atom_ligand(ligand): + ion_smiles = get_ion_smiles() + if ligand.resname in ion_smiles: + mol = Molecule.from_smiles( + ion_smiles[ligand.resname], allow_undefined_stereo=True + ) + logger.debug(f"Created ion molecule for {ligand.resname}") + return mol + else: + raise ValueError( + f"Unknown ion '{ligand.resname}'. Provide SMILES via " + f"ligand_smiles={{'{ligand.resname}': '[Element+charge]'}}" + ) - # Try to create from PDB block + # 3. Try RDKit bond perception from PDB coordinates pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + try: + mol = _create_molecule_via_rdkit(pdb_block) + logger.debug(f"Created molecule for {ligand.resname} via RDKit") + return mol + except Exception as e: + logger.debug(f"RDKit parsing failed: {e}") + # 4. 
Fallback: direct OpenFF PDB parsing try: - # Try direct PDB parsing mol = Molecule.from_pdb_file( io.StringIO(pdb_block), allow_undefined_stereo=True, ) logger.debug(f"Created molecule for {ligand.resname} from PDB") return mol - except Exception as e: - logger.debug(f"Direct PDB parsing failed: {e}") - - # Fallback: use RDKit - try: - mol = _create_molecule_via_rdkit(pdb_block) - logger.debug(f"Created molecule for {ligand.resname} via RDKit") - return mol except Exception as e: raise ValueError( f"Could not create molecule for ligand {ligand.resname}: {e}" diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index e4b1a5f..3373beb 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -22,7 +22,6 @@ WATER_RESIDUES, create_openff_molecule, extract_ligands_from_pdb, - get_common_ligand_smiles, ligand_pdb_to_topology, ) @@ -324,9 +323,8 @@ def _relax_unconstrained_with_ligands( # Step 1: Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - logger.info( - f"Found {len(ligands)} ligand(s): {[l.resname for l in ligands]}" - ) + ligand_names = [lig.resname for lig in ligands] + logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") # Step 2: Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) @@ -335,12 +333,12 @@ def _relax_unconstrained_with_ligands( fixer.addMissingAtoms() # Step 3: Create OpenFF molecules for ligands - known_smiles = get_common_ligand_smiles() - known_smiles.update(self.config.ligand_smiles or {}) + # User-provided SMILES override automatic detection + user_smiles = self.config.ligand_smiles or {} openff_molecules = [] for ligand in ligands: - smiles = known_smiles.get(ligand.resname) + smiles = user_smiles.get(ligand.resname) try: mol = create_openff_molecule(ligand, smiles=smiles) openff_molecules.append(mol) diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py index f83b1b0..5187941 100644 --- 
a/tests/test_ligand_utils.py +++ b/tests/test_ligand_utils.py @@ -4,7 +4,8 @@ WATER_RESIDUES, LigandInfo, extract_ligands_from_pdb, - get_common_ligand_smiles, + get_ion_smiles, + is_single_atom_ligand, ) @@ -82,37 +83,56 @@ def test_multiple_ligands(self): assert resnames == {"HEM", "NAD"} -class TestCommonSmiles: - """Tests for common ligand SMILES lookup.""" - - def test_heme_smiles_present(self): - """Test that HEM SMILES is defined.""" - smiles = get_common_ligand_smiles() - assert "HEM" in smiles - assert len(smiles["HEM"]) > 0 - # Should contain iron - assert "Fe" in smiles["HEM"] - - def test_atp_smiles_present(self): - """Test that ATP SMILES is defined.""" - smiles = get_common_ligand_smiles() - assert "ATP" in smiles - # ATP has phosphate groups - assert "P" in smiles["ATP"] - - def test_common_cofactors(self): - """Test that common cofactors are defined.""" - smiles = get_common_ligand_smiles() - expected = ["NAD", "FAD", "ATP", "ADP", "GTP"] - for cofactor in expected: - assert cofactor in smiles, f"{cofactor} should be in common SMILES" +class TestIonSmiles: + """Tests for ion SMILES lookup.""" def test_common_ions(self): """Test that common metal ions are defined.""" - smiles = get_common_ligand_smiles() + smiles = get_ion_smiles() ions = ["ZN", "MG", "CA", "FE", "MN", "CU"] for ion in ions: - assert ion in smiles, f"{ion} should be in common SMILES" + assert ion in smiles, f"{ion} should be in ion SMILES" + + def test_ion_smiles_format(self): + """Test that ion SMILES have proper format with charge.""" + smiles = get_ion_smiles() + # All ions should have brackets indicating charged species + for ion, smi in smiles.items(): + assert smi.startswith("["), f"{ion} SMILES should start with [" + assert smi.endswith("]"), f"{ion} SMILES should end with ]" + + def test_halide_ions(self): + """Test that halide ions are defined.""" + smiles = get_ion_smiles() + assert "CL" in smiles + assert smiles["CL"] == "[Cl-]" + + +class TestSingleAtomLigand: + """Tests 
for single atom ligand detection.""" + + def test_ion_is_single_atom(self): + """Test that ions are detected as single atoms.""" + ligand = LigandInfo( + resname="ZN", + chain_id="A", + resnum=1, + pdb_lines=["HETATM 1 ZN ZN A 1 0.0 0.0 0.0"], + ) + assert is_single_atom_ligand(ligand) + + def test_heme_is_not_single_atom(self): + """Test that multi-atom ligands are not single atoms.""" + ligand = LigandInfo( + resname="HEM", + chain_id="A", + resnum=1, + pdb_lines=[ + "HETATM 1 FE HEM A 1 0.0 0.0 0.0", + "HETATM 2 NA HEM A 1 1.0 0.0 0.0", + ], + ) + assert not is_single_atom_ligand(ligand) class TestLigandInfo: From bed3d04d7454156d9d9e76a0f0bb11f6fd586c4a Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:43:31 -0500 Subject: [PATCH 08/22] Use try-except imports with helpful error messages for conda deps The ligand libraries (openmmforcefields, openff-toolkit, rdkit) are only available via conda-forge, not PyPI. Instead of lazy imports, use top-level try-except blocks with clear ImportError messages that tell users exactly how to install the missing dependencies: - ligand_utils.py: Check for openff-toolkit and rdkit at import time - relaxer.py: Check for openmmforcefields at import time - Both provide clear conda install commands with version numbers - pyproject.toml: Updated comment with full conda install command - cli.py: Removed optional dependency check (now handled at runtime) Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 14 ++---------- src/graphrelax/cli.py | 14 ++---------- src/graphrelax/ligand_utils.py | 42 ++++++++++++++++++++++++++++------ src/graphrelax/relaxer.py | 31 +++++++++++++++++-------- 4 files changed, 61 insertions(+), 40 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 266e8a0..2ee2d87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,8 @@ dependencies = [ "openmm", ] -# Note: pdbfixer is not on PyPI, install via conda: -# conda install -c conda-forge pdbfixer -# For ligand support in 
unconstrained minimization: -# pip install graphrelax[ligands] -# or: -# conda install -c conda-forge openmmforcefields openff-toolkit rdkit +# Note: Some dependencies are only available via conda-forge: +# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit [project.urls] Homepage = "https://github.com/delalamo/GraphRelax" @@ -67,12 +63,6 @@ test = [ "pytest>=7.0", "pytest-cov>=4.1", ] -ligands = [ - # For ligand parameterization in unconstrained minimization - "openmmforcefields>=0.13.0", - "openff-toolkit>=0.14.0", - "rdkit>=2023.09.1", -] [project.scripts] graphrelax = "graphrelax.cli:main" diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 0a66860..57d25fc 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -221,8 +221,7 @@ def create_parser() -> argparse.ArgumentParser: action="store_true", help=( "Include ligands in unconstrained minimization using " - "openmmforcefields. Requires: pip install openmmforcefields " - "openff-toolkit. If not specified with ligands present, " + "openmmforcefields. If not specified with ligands present, " "--constrained-minimization is required." ), ) @@ -345,20 +344,11 @@ def main(args=None) -> int: ) if has_ligands and uses_relaxation: if opts.include_ligands: - # Verify openmmforcefields is available - try: - import openmmforcefields # noqa: F401 - except ImportError: - logger.error( - "The --include-ligands flag requires openmmforcefields. " - "Install with: pip install openmmforcefields openff-toolkit" - ) - return 1 logger.info("Ligand support enabled via openmmforcefields") elif not opts.constrained_minimization: logger.error( "Input PDB contains ligands (HETATM records). Use one of:\n" - " 1. --include-ligands (requires openmmforcefields)\n" + " 1. --include-ligands\n" " 2. 
--constrained-minimization" ) return 1 diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 0c5d702..a0193fa 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -5,8 +5,37 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +from openmm import app as openmm_app + +# Ligand parameterization dependencies (conda-forge only) +# conda install -c conda-forge openff-toolkit=0.14.0 rdkit=2023.09.1 +try: + from openff.toolkit import Molecule + from rdkit import Chem + from rdkit.Chem import AllChem + + LIGAND_DEPS_AVAILABLE = True +except ImportError: + LIGAND_DEPS_AVAILABLE = False + Molecule = None + Chem = None + AllChem = None + logger = logging.getLogger(__name__) + +def _check_ligand_deps(): + """Raise ImportError if ligand dependencies are not available.""" + if not LIGAND_DEPS_AVAILABLE: + raise ImportError( + "Ligand parameterization requires openff-toolkit and rdkit.\n" + "These packages are only available via conda-forge:\n\n" + " conda install -c conda-forge " + "openff-toolkit=0.14.0 rdkit=2023.09.1\n\n" + "See the README for full installation instructions." + ) + + # Water residue names to exclude from ligand processing WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} @@ -110,14 +139,19 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): 3. From PDB coordinates via RDKit bond perception 4. From direct OpenFF PDB parsing + Requires openff-toolkit and rdkit (install via conda-forge). + Args: ligand: LigandInfo with PDB coordinates smiles: Optional SMILES string (overrides automatic detection) Returns: openff.toolkit.Molecule + + Raises: + ImportError: If openff-toolkit or rdkit are not installed. """ - from openff.toolkit import Molecule + _check_ligand_deps() # 1. 
User-provided SMILES takes precedence if smiles: @@ -168,10 +202,6 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): def _create_molecule_via_rdkit(pdb_block: str): """Create OpenFF Molecule via RDKit from PDB block.""" - from openff.toolkit import Molecule - from rdkit import Chem - from rdkit.Chem import AllChem - # Parse PDB with RDKit mol = Chem.MolFromPDBBlock(pdb_block, removeHs=False, sanitize=False) @@ -203,8 +233,6 @@ def ligand_pdb_to_topology(ligand: LigandInfo): Returns: Tuple of (topology, positions) """ - from openmm import app as openmm_app - pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block)) return pdb_file.topology, pdb_file.positions diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 3373beb..fd68b03 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -10,6 +10,7 @@ from openmm import Platform from openmm import app as openmm_app from openmm import openmm, unit +from pdbfixer import PDBFixer from graphrelax.chain_gaps import ( detect_chain_gaps, @@ -25,13 +26,27 @@ ligand_pdb_to_topology, ) -# Check if openmmforcefields is available for ligand support +# openmmforcefields is only available via conda-forge +# Install with: conda install -c conda-forge openmmforcefields=0.13.0 try: from openmmforcefields.generators import SystemGenerator OPENMMFF_AVAILABLE = True except ImportError: OPENMMFF_AVAILABLE = False + SystemGenerator = None + + +def _check_openmmff(): + """Raise ImportError if openmmforcefields is not available.""" + if not OPENMMFF_AVAILABLE: + raise ImportError( + "Ligand support requires openmmforcefields.\n" + "This package is only available via conda-forge:\n\n" + " conda install -c conda-forge openmmforcefields=0.13.0\n\n" + "See the README for full installation instructions." 
+ ) + # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -187,11 +202,6 @@ def _relax_unconstrained( ) if has_ligands and self.config.include_ligands: - if not OPENMMFF_AVAILABLE: - raise ImportError( - "openmmforcefields is required for ligand support. " - "Install with: pip install openmmforcefields openff-toolkit" - ) return self._relax_unconstrained_with_ligands(pdb_string) ENERGY = unit.kilocalories_per_mole @@ -219,8 +229,6 @@ def _relax_unconstrained( protein_pdb = "\n".join(atom_lines) + "\nEND\n" # Use pdbfixer to add missing atoms and terminal groups - from pdbfixer import PDBFixer - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) fixer.findMissingResidues() fixer.findMissingAtoms() @@ -303,13 +311,18 @@ def _relax_unconstrained_with_ligands( The protein is processed separately with pdbfixer, then combined with parameterized ligands for minimization. + Requires openmmforcefields (install via conda-forge). + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) + + Raises: + ImportError: If openmmforcefields is not installed. 
""" - from pdbfixer import PDBFixer + _check_openmmff() ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms From 2b3dae358241c2045a7510a48bf49c9c952d160e Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:32:04 -0500 Subject: [PATCH 09/22] Add crystallography artifact removal and mamba installation docs - Add artifacts.py with comprehensive artifact detection for buffers, cryoprotectants, detergents, lipids, reducing agents, and halide ions - Auto-remove artifacts by default, preserve biologically relevant ions - Add --keep-all-ligands and --keep-ligand flags to whitelist residues - Update README with mamba/micromamba installation instructions - Add tests for artifact detection and removal Co-Authored-By: Claude Opus 4.5 --- .pre-commit-config.yaml | 2 +- README.md | 194 ++++++++++++++----- environment.yml | 48 +++++ requirements.txt | 49 +++++ src/graphrelax/artifacts.py | 336 +++++++++++++++++++++++++++++++++ src/graphrelax/cli.py | 54 ++++-- src/graphrelax/config.py | 10 +- src/graphrelax/ligand_utils.py | 22 ++- src/graphrelax/pipeline.py | 15 ++ src/graphrelax/relaxer.py | 7 +- tests/test_artifacts.py | 326 ++++++++++++++++++++++++++++++++ 11 files changed, 991 insertions(+), 72 deletions(-) create mode 100644 environment.yml create mode 100644 requirements.txt create mode 100644 src/graphrelax/artifacts.py create mode 100644 tests/test_artifacts.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 362fbc1..569ecd2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - "--enable=C0415" - "--score=n" name: pylint (no lazy imports) - exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/LigandMPNN/|tests/)" + exclude: 
"^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/pipeline.py|src/graphrelax/LigandMPNN/|tests/)" - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/README.md b/README.md index 6b0801e..7e8558b 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,62 @@ GraphRelax combines **LigandMPNN** (for sequence design and side-chain packing) ## Installation +### Quick Start (Recommended) + +The easiest way to install GraphRelax with all dependencies is using mamba (much faster than conda): + +```bash +# Install mamba if you don't have it (or use micromamba) +conda install -n base -c conda-forge mamba + +# Create environment from file +mamba env create -f environment.yml +conda activate graphrelax + +# Install GraphRelax +pip install -e . +``` + +Or with **micromamba** (standalone, no conda required): + +```bash +# Install micromamba (see https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html) +# Then: +micromamba create -f environment.yml +micromamba activate graphrelax +pip install -e . +``` + +This installs all dependencies including ligand support. + +### Standard Installation + +For basic usage without ligand support: + ```bash -# Install from PyPI pip install graphrelax ``` LigandMPNN model weights (~40MB) are downloaded automatically on first run. +### Full Installation (with Ligand Support) + +Some dependencies are only available via conda-forge and must be installed before pip: + +```bash +# Step 1: Install conda-forge dependencies (use mamba for speed) +mamba install -c conda-forge \ + pdbfixer \ + openmmforcefields>=0.13.0 \ + openff-toolkit>=0.14.0 \ + rdkit>=2023.09.1 + +# Step 2: Install GraphRelax +pip install graphrelax +``` + +> **Note:** `mamba` is a drop-in replacement for `conda` that's 10-100x faster. 
Install it with `conda install -n base -c conda-forge mamba`, or use `micromamba` as a standalone tool. + ### Development Installation ```bash @@ -20,31 +69,39 @@ LigandMPNN model weights (~40MB) are downloaded automatically on first run. git clone https://github.com/your-username/GraphRelax.git cd GraphRelax -# Install in editable mode +# Option A: Use environment.yml with mamba (recommended) +mamba env create -f environment.yml +conda activate graphrelax pip install -e . -``` -### Optional: Constrained Minimization - -If you want to use `--constrained-minimization` mode (AlphaFold-style relaxation with position restraints and violation checking), you also need pdbfixer: - -```bash -# pdbfixer is only available via conda-forge, not PyPI -conda install -c conda-forge pdbfixer +# Option B: Manual installation +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +pip install -e . ``` -### Platform-specific Installation +### Dependencies -```bash -# CPU-only (smaller install, no GPU dependencies) -pip install "graphrelax[cpu]" +#### Pip packages (installed automatically) -# With CUDA 11 GPU support -pip install "graphrelax[cuda11]" +| Package | Version | Purpose | +| -------------- | ------- | ------------------------ | +| torch | >= 2.0 | Neural network inference | +| numpy | < 2 | Array operations | +| openmm | latest | Molecular dynamics | +| biopython | latest | Structure parsing | +| prody | latest | Protein analysis | +| absl-py | latest | Configuration | +| ml-collections | latest | Configuration | +| dm-tree | latest | Nested data structures | -# With CUDA 12 GPU support -pip install "graphrelax[cuda12]" -``` +#### Conda-forge packages (manual installation required) + +| Package | Version | Purpose | Required for | +| ----------------- | ------------ | --------------------------- | ---------------------------- | +| pdbfixer | latest | Structure preparation | `--constrained-minimization` | +| openmmforcefields | >= 0.13.0 | Small 
molecule force fields | `--include-ligands` | +| openff-toolkit | >= 0.14.0 | OpenFF Molecule creation | `--include-ligands` | +| rdkit | >= 2023.09.1 | Bond perception from PDB | `--include-ligands` | ### Docker @@ -70,24 +127,6 @@ docker build -t graphrelax . docker run --rm graphrelax --help ``` -### Dependencies - -Core dependencies (installed automatically via pip): - -- Python >= 3.9 -- PyTorch >= 2.0 -- NumPy < 2.0 (PyTorch <2.5 is incompatible with NumPy 2.x) -- OpenMM -- BioPython -- ProDy -- dm-tree -- absl-py -- ml-collections - -Optional (for `--constrained-minimization` only): - -- pdbfixer (conda-forge only, not on PyPI) - ## Features - **FastRelax-like protocol**: Alternate between side-chain repacking and energy minimization @@ -165,23 +204,77 @@ graphrelax -i input.pdb -o relaxed.pdb --constrained-minimization --stiffness 5. | Default (unconstrained) | No | No | Fast | No | | `--constrained-minimization` | Yes (harmonic) | Yes | Slower | Yes | -**Important:** When your input PDB contains ligands or other non-standard residues (HETATM records other than water), you **must** use `--constrained-minimization`. The unconstrained mode uses AMBER force field parameters which don't include templates for non-standard molecules. Constrained mode uses OpenFold's AmberRelaxation which can handle ligands properly. +### Crystallography Artifact Removal + +By default, GraphRelax **automatically removes** common crystallography and cryo-EM artifacts from input structures: + +- **Buffers**: SO4, PO4, CIT, ACT, MES, HEPES, Tris, etc. +- **Cryoprotectants**: GOL (glycerol), EDO (ethylene glycol), MPD, PEG variants, DMSO +- **Detergents**: DDM, OG, LDAO, CHAPS, SDS, etc. +- **Lipids/Fatty acids**: PLM (palmitic), MYR (myristic), OLA (oleic), etc. +- **Reducing agents**: BME, DTT +- **Halide ions**: CL, BR, IOD + +**Biologically relevant metal ions** (ZN, MG, CA, FE, MN, CU, etc.) are **preserved**. 
+ +```bash +# Artifacts are removed by default +graphrelax -i crystal_structure.pdb -o relaxed.pdb + +# Keep all HETATM records including artifacts +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-all-ligands + +# Keep specific artifacts that you need +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL --keep-ligand SO4 +``` ### Working with Ligands -When designing proteins with bound ligands (e.g., heme, cofactors, small molecules), use `ligand_mpnn` model type with constrained minimization: +Biologically relevant ligands (cofactors, substrates, inhibitors) are **auto-detected** and parameterized using openmmforcefields. No special flags are needed. ```bash -# Design around a ligand +# Ligands are automatically included and parameterized +graphrelax -i protein_with_ligand.pdb -o designed.pdb --design + +# Choose a specific force field for ligands graphrelax -i protein_with_ligand.pdb -o designed.pdb \ - --design --model-type ligand_mpnn --constrained-minimization + --ligand-forcefield gaff-2.11 -# Repack side chains around a ligand -graphrelax -i protein_with_ligand.pdb -o repacked.pdb \ - --relax --model-type ligand_mpnn --constrained-minimization +# Provide SMILES for unknown ligands +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --ligand-smiles "LIG:c1ccccc1" + +# Strip all ligands (protein-only processing) +graphrelax -i protein_with_ligand.pdb -o designed.pdb --ignore-ligands +``` + +**Requires:** `conda install -c conda-forge openmmforcefields openff-toolkit rdkit` + +For design around ligands, use `ligand_mpnn` model type: + +```bash +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --design --model-type ligand_mpnn +``` + +#### Ligand Force Field Options + +| Force Field | Flag | Description | +| --------------- | ------------------------------------ | -------------------------- | +| OpenFF Sage 2.0 | `--ligand-forcefield openff-2.0.0` | Modern, accurate (default) | +| GAFF 2.11 | `--ligand-forcefield 
gaff-2.11` | Well-tested, robust | +| Espaloma 0.3 | `--ligand-forcefield espaloma-0.3.0` | ML-based, fast | + +#### Alternative: Constrained Minimization + +Use `--constrained-minimization` for position-restrained minimization (does not require openmmforcefields): + +```bash +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --design --constrained-minimization ``` -**Note:** If you attempt to use unconstrained minimization with a PDB containing ligands, GraphRelax will exit with an error message directing you to use `--constrained-minimization`. +**Requires:** `conda install -c conda-forge pdbfixer` ### Resfile Format @@ -247,13 +340,22 @@ Relaxation options: --constrained-minimization Use constrained minimization with position restraints (AlphaFold-style). Default is unconstrained. Requires pdbfixer. - **Required when input PDB contains ligands.** --stiffness K Restraint stiffness in kcal/mol/A^2 (default: 10.0) Only applies to constrained minimization. --max-iterations N Max L-BFGS iterations, 0=unlimited (default: 0) + --ignore-ligands Strip all ligands before processing. By default, + ligands are auto-detected and parameterized. + --ligand-forcefield FF Force field for ligands: openff-2.0.0 (default), + gaff-2.11, or espaloma-0.3.0 + --ligand-smiles RES:SMILES Provide SMILES for a ligand residue. Can be + used multiple times for multiple ligands. Input preprocessing: --keep-waters Keep water molecules in input (default: removed) + --keep-all-ligands Keep all HETATM records including crystallography + artifacts. By default, common artifacts are removed. + --keep-ligand RESNAME Keep specific ligand residue (can be used multiple + times). 
Example: --keep-ligand GOL --keep-ligand SO4 Scoring: --scorefile FILE Output scorefile with energy terms diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..1398256 --- /dev/null +++ b/environment.yml @@ -0,0 +1,48 @@ +# GraphRelax Conda Environment +# ============================ +# +# Creates a complete environment with all dependencies for GraphRelax, +# including ligand support. +# +# Usage (mamba is 10-100x faster than conda): +# mamba env create -f environment.yml +# conda activate graphrelax +# pip install -e . +# +# Or with micromamba (standalone, no conda required): +# micromamba create -f environment.yml +# micromamba activate graphrelax +# pip install -e . +# +# For GPU support, also install CUDA-enabled PyTorch: +# pip install torch --index-url https://download.pytorch.org/whl/cu118 + +name: graphrelax +channels: + - conda-forge + - defaults +dependencies: + # Python version + - python>=3.9 + + # Conda-forge only packages (not on PyPI) + - pdbfixer + - openmmforcefields>=0.13.0 + - openff-toolkit>=0.14.0 + - rdkit>=2023.09.1 + + # Core dependencies (also available via pip) + - openmm + - numpy<2 + - biopython + - prody + + # pip dependencies + - pip + - pip: + - torch>=2.0 + - absl-py + - ml-collections + - dm-tree + # Install graphrelax itself + - -e . diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3c80844 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,49 @@ +# GraphRelax Requirements +# ====================== +# +# This file lists ALL dependencies needed to run GraphRelax with full +# functionality, including ligand support. +# +# INSTALLATION ORDER: +# 1. First install conda-forge packages (not available on PyPI) +# 2. Then install pip packages +# +# Quick install (recommended): +# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +# pip install -r requirements.txt +# +# Or use the environment.yml file for a complete conda environment. 
+ +# ============================================================================= +# CONDA-FORGE ONLY (must be installed via conda before pip) +# ============================================================================= +# These packages are NOT on PyPI and must be installed via conda-forge: +# +# conda install -c conda-forge \ +# pdbfixer \ +# openmmforcefields>=0.13.0 \ +# openff-toolkit>=0.14.0 \ +# rdkit>=2023.09.1 +# +# pdbfixer: Required for unconstrained minimization and ligand support +# openmmforcefields: Required for --include-ligands (ligand force fields) +# openff-toolkit: Required for --include-ligands (molecule parameterization) +# rdkit: Required for --include-ligands (bond perception from PDB) + +# ============================================================================= +# PIP PACKAGES (install after conda packages) +# ============================================================================= + +# Core dependencies +torch>=2.0 +numpy<2 # PyTorch <2.5 is incompatible with NumPy 2.x +openmm +biopython +prody +absl-py +ml-collections +dm-tree + +# Development/testing (optional) +# pytest>=7.0 +# pytest-cov>=4.1 diff --git a/src/graphrelax/artifacts.py b/src/graphrelax/artifacts.py new file mode 100644 index 0000000..f97e48c --- /dev/null +++ b/src/graphrelax/artifacts.py @@ -0,0 +1,336 @@ +"""Constants and utilities for crystallography artifact detection and removal. + +Common crystallography and cryo-EM artifacts (buffers, cryoprotectants, +detergents, lipids) are stripped by default during preprocessing. +Use --keep-all-ligands to preserve them. 
+""" + +import logging +from collections import defaultdict +from typing import Dict, Optional, Set, Tuple + +logger = logging.getLogger(__name__) + +# ============================================================================= +# Artifact Categories +# ============================================================================= + +# Buffer components commonly found in crystallization conditions +BUFFER_ARTIFACTS = frozenset( + { + # Sulfate/Phosphate + "SO4", + "PO4", + "PO3", + "PI", + "2HP", + "2PO", + "SPO", + "SPH", + # Organic acids + "CIT", + "FLC", + "ACT", + "ACE", + "FMT", + "FOR", + "NO3", + "AZI", + # HEPES, MES, Tris, etc. + "MES", + "EPE", + "TRS", + "CAC", + "BIS", + "HEZ", + "MOH", + "BTB", + "TAM", + "MRD", + "PGO", + "144", + "IPS", + # Malonate, Tartrate, etc. + "MLI", + "TAR", + "MAL", + "SUC", + "FUM", + # Borate + "BO3", + "BO4", + # Imidazole (from His-tag purification) + "IMD", + "1MZ", + } +) + +# Cryoprotectants used for flash-cooling crystals +CRYOPROTECTANT_ARTIFACTS = frozenset( + { + # Glycerol and glycols + "GOL", + "EDO", + "EGL", + "PGR", + "PGQ", + "GLC", + # MPD (2-Methyl-2,4-pentanediol) + "MPD", + "1PG", + "PDO", + # PEG variants (polyethylene glycol fragments) + "PEG", + "PGE", + "PG4", + "1PE", + "P6G", + "P33", + "PE4", + "PG0", + "2PE", + "PEU", + "PE8", + "PE5", + "XPE", + "12P", + "15P", + "PG5", + "PG6", + "PEE", + "PE3", + "P4G", + "P2E", + # DMSO + "DMS", + # Isopropanol + "IPA", + } +) + +# Detergents used for membrane protein crystallization +DETERGENT_ARTIFACTS = frozenset( + { + # Maltosides (DDM, DM, UDM, etc.) + "LMT", + "MLA", + "BMA", + "TRE", + "DDM", + "DXM", + # Glucosides (OG, NG, etc.) 
+ "BOG", + "BGC", + "NDG", + "HTG", + "OGA", + "NGS", + # LDAO (Lauryldimethylamine-N-oxide) + "LDA", + "DAO", + # CHAPS/CHAPSO + "CPS", + "CHT", + "SDS", + "CHP", + # Triton, digitonin + "TRT", + "T3A", + "D10", + "D12", + "DGT", + # LMNG, cymals + "MNG", + "CYC", + "LMG", + # C12E8/9 polyoxyethylene + "CE9", + "C8E", + "C10", + "C12", + # Octyl glucoside + "OLC", + } +) + +# Lipids and fatty acids (from LCP crystallization or membrane proteins) +LIPID_ARTIFACTS = frozenset( + { + # Common fatty acids + "PLM", + "MYR", + "OLA", + "STE", + "PAL", + "LNL", + "ARA", + "DCA", + "UND", + "MYS", + "MYA", + "LNO", + "EOA", + "PEF", + # Monoolein (lipidic cubic phase crystallization) + "OLB", + "9OL", + "MPG", + "OLI", + # Phospholipids and fragments + "PC", + "PE", + "PG", + "PS", + "PLC", + "EPH", + "CDL", + # Cholesterol + "CLR", + "CHO", + } +) + +# Reducing agents from protein preparation +REDUCING_AGENT_ARTIFACTS = frozenset( + { + "BME", + "DTT", + "DTU", + "TCE", + "TRO", + "GSH", + } +) + +# Halide ions (often crystallization additives, not biologically relevant) +HALIDE_ARTIFACTS = frozenset( + { + "CL", + "BR", + "IOD", + "F", + } +) + +# Unknown/placeholder atoms +UNKNOWN_ARTIFACTS = frozenset( + { + "UNX", + "UNL", + "UNK", + "DUM", + } +) + +# ============================================================================= +# Combined Sets +# ============================================================================= + +# Master set of all artifacts to remove by default +CRYSTALLOGRAPHY_ARTIFACTS = ( + BUFFER_ARTIFACTS + | CRYOPROTECTANT_ARTIFACTS + | DETERGENT_ARTIFACTS + | LIPID_ARTIFACTS + | REDUCING_AGENT_ARTIFACTS + | HALIDE_ARTIFACTS + | UNKNOWN_ARTIFACTS +) + +# Biologically relevant ions - NOT stripped by default +# These are often structurally/functionally important +BIOLOGICALLY_RELEVANT_IONS = frozenset( + { + "ZN", + "MG", + "CA", + "FE", + "FE2", + "MN", + "CU", + "CO", + "NI", + "MO", + "NA", + "K", # Often biologically relevant in 
channels/pumps + } +) + +# Water residues (handled separately by remove_waters) +WATER_RESIDUES = frozenset({"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"}) + + +# ============================================================================= +# Removal Functions +# ============================================================================= + + +def remove_artifacts( + pdb_string: str, + keep_residues: Optional[Set[str]] = None, +) -> Tuple[str, Dict[str, int]]: + """ + Remove crystallography artifacts from PDB string. + + Artifacts are identified by their residue name (columns 17-20 in PDB + format). HETATM records with residue names in CRYSTALLOGRAPHY_ARTIFACTS + are removed unless they appear in keep_residues. + + Args: + pdb_string: PDB file contents as string + keep_residues: Set of residue names to preserve (whitelist) + + Returns: + Tuple of: + - filtered_pdb: PDB string with artifacts removed + - removed_counts: Dict mapping residue name to atom count + """ + if keep_residues is None: + keep_residues = set() + + # Normalize whitelist to uppercase + keep_residues = {r.upper() for r in keep_residues} + + # Residues to remove = artifacts minus whitelist + to_remove = CRYSTALLOGRAPHY_ARTIFACTS - keep_residues + + kept_lines = [] + removed_counts = defaultdict(int) + + for line in pdb_string.split("\n"): + if line.startswith("HETATM"): + resname = line[17:20].strip().upper() + if resname in to_remove: + removed_counts[resname] += 1 + continue + kept_lines.append(line) + + filtered_pdb = "\n".join(kept_lines) + + return filtered_pdb, dict(removed_counts) + + +def is_artifact(resname: str) -> bool: + """ + Check if a residue name is a known crystallography artifact. + + Args: + resname: Three-letter residue code + + Returns: + True if the residue is in CRYSTALLOGRAPHY_ARTIFACTS + """ + return resname.upper() in CRYSTALLOGRAPHY_ARTIFACTS + + +def is_biologically_relevant_ion(resname: str) -> bool: + """ + Check if a residue name is a biologically relevant ion. 
+ + Args: + resname: Three-letter residue code + + Returns: + True if the residue is in BIOLOGICALLY_RELEVANT_IONS + """ + return resname.upper() in BIOLOGICALLY_RELEVANT_IONS diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 57d25fc..4653f5b 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -217,12 +217,12 @@ def create_parser() -> argparse.ArgumentParser: ), ) relax_group.add_argument( - "--include-ligands", + "--ignore-ligands", action="store_true", help=( - "Include ligands in unconstrained minimization using " - "openmmforcefields. If not specified with ligands present, " - "--constrained-minimization is required." + "Strip all ligands (HETATM records) before processing. " + "By default, ligands are auto-detected and parameterized " + "using openmmforcefields." ), ) relax_group.add_argument( @@ -262,6 +262,26 @@ def create_parser() -> argparse.ArgumentParser: action="store_true", help="Keep water molecules in input (default: waters are removed)", ) + preprocess_group.add_argument( + "--keep-all-ligands", + action="store_true", + help=( + "Keep all HETATM records including crystallography artifacts " + "(buffers, cryoprotectants, detergents, lipids). " + "By default, common artifacts are removed." + ), + ) + preprocess_group.add_argument( + "--keep-ligand", + type=str, + action="append", + metavar="RESNAME", + help=( + "Keep a specific ligand residue that would otherwise be stripped " + "as an artifact. Can be used multiple times. 
" + "Example: --keep-ligand GOL --keep-ligand SO4" + ), + ) # General options general_group = parser.add_argument_group("General options") @@ -336,22 +356,19 @@ def main(args=None) -> int: input_format = detect_format(opts.input) has_ligands = _check_for_ligands(opts.input, input_format) - # Validate: ligand handling requires --include-ligands or --constrained + # Log ligand handling info uses_relaxation = mode in ( PipelineMode.RELAX, PipelineMode.NO_REPACK, PipelineMode.DESIGN, ) if has_ligands and uses_relaxation: - if opts.include_ligands: - logger.info("Ligand support enabled via openmmforcefields") - elif not opts.constrained_minimization: - logger.error( - "Input PDB contains ligands (HETATM records). Use one of:\n" - " 1. --include-ligands\n" - " 2. --constrained-minimization" - ) - return 1 + if opts.ignore_ligands: + logger.info("Ligands will be stripped (--ignore-ligands)") + elif opts.constrained_minimization: + logger.info("Ligands handled via constrained minimization") + else: + logger.info("Ligands auto-detected, using openmmforcefields") logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") @@ -383,11 +400,16 @@ def main(args=None) -> int: max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, - include_ligands=opts.include_ligands, + ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, ) + # Build keep_residues set from --keep-ligand flags + keep_residues = set() + if opts.keep_ligand: + keep_residues = {r.upper() for r in opts.keep_ligand} + pipeline_config = PipelineConfig( mode=mode, n_iterations=opts.n_iter, @@ -395,6 +417,8 @@ def main(args=None) -> int: scorefile=opts.scorefile, verbose=opts.verbose, remove_waters=not opts.keep_waters, + remove_artifacts=not opts.keep_all_ligands, + keep_residues=keep_residues, design=design_config, relax=relax_config, ) diff --git 
a/src/graphrelax/config.py b/src/graphrelax/config.py index c00f37a..909f43b 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, Literal, Optional +from typing import Dict, Literal, Optional, Set LigandForceField = Literal["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"] @@ -46,10 +46,8 @@ class RelaxConfig: split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available - # Ligand support options - include_ligands: bool = ( - False # Enable ligand parameterization in unconstrained - ) + # Ligand support options (ligands are auto-detected) + ignore_ligands: bool = False # If True, strip all ligands before processing ligand_forcefield: LigandForceField = ( "openff-2.0.0" # Force field for ligands ) @@ -68,5 +66,7 @@ class PipelineConfig: scorefile: Optional[Path] = None # If set, write scores to this file verbose: bool = False remove_waters: bool = True # Remove water molecules from input + remove_artifacts: bool = True # Remove crystallography artifacts by default + keep_residues: Set[str] = field(default_factory=set) # Whitelist residues design: DesignConfig = field(default_factory=DesignConfig) relax: RelaxConfig = field(default_factory=RelaxConfig) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index a0193fa..6c13ef8 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -51,22 +51,36 @@ class LigandInfo: smiles: Optional[str] = None -def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: +def extract_ligands_from_pdb( + pdb_string: str, + exclude_artifacts: bool = True, +) -> Tuple[str, List[LigandInfo]]: """ Separate protein ATOM records from ligand HETATM records. 
+ By default, crystallography artifacts (buffers, cryoprotectants, detergents, + lipids) are excluded from the ligand list since they cannot be meaningfully + parameterized for minimization. + Args: pdb_string: Full PDB string with protein and ligands + exclude_artifacts: If True, skip known crystallography artifacts Returns: Tuple of (protein_only_pdb, list_of_ligand_info) """ + # Import here to avoid circular imports + if exclude_artifacts: + from graphrelax.artifacts import CRYSTALLOGRAPHY_ARTIFACTS + else: + CRYSTALLOGRAPHY_ARTIFACTS = set() + protein_lines = [] ligand_lines_by_residue = {} for line in pdb_string.split("\n"): if line.startswith("HETATM"): - resname = line[17:20].strip() + resname = line[17:20].strip().upper() chain_id = line[21] if len(line) > 21 else " " try: resnum = int(line[22:26].strip()) @@ -77,6 +91,10 @@ def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: if resname in WATER_RESIDUES: continue + # Skip known artifacts (they won't be parameterized) + if resname in CRYSTALLOGRAPHY_ARTIFACTS: + continue + key = (chain_id, resnum, resname) if key not in ligand_lines_by_residue: ligand_lines_by_residue[key] = [] diff --git a/src/graphrelax/pipeline.py b/src/graphrelax/pipeline.py index 7e50cc1..e03ea28 100644 --- a/src/graphrelax/pipeline.py +++ b/src/graphrelax/pipeline.py @@ -182,6 +182,21 @@ def _run_single_output( if removed > 0: logger.info(f"Removed {removed} water-related lines from input") + # Remove crystallography artifacts if requested + if self.config.remove_artifacts: + from graphrelax.artifacts import remove_artifacts + + current_structure, removed_counts = remove_artifacts( + current_structure, + keep_residues=self.config.keep_residues, + ) + if removed_counts: + total = sum(removed_counts.values()) + artifacts = ", ".join( + f"{k}({v})" for k, v in sorted(removed_counts.items()) + ) + logger.info(f"Removed {total} artifact atoms: {artifacts}") + # Convert to PDB format for internal processing if 
needed current_pdb = ensure_pdb_format(current_structure, input_pdb) diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index fd68b03..41f46e9 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -185,8 +185,8 @@ def _relax_unconstrained( No position restraints, no violation checking, uses OpenMM defaults. This is the default minimization mode. - If config.include_ligands is True and ligands are present, uses - openmmforcefields for ligand parameterization. + Ligands are auto-detected. If present and ignore_ligands is False, + uses openmmforcefields for ligand parameterization. Args: pdb_string: PDB file contents as string @@ -201,7 +201,8 @@ def _relax_unconstrained( for line in pdb_string.split("\n") ) - if has_ligands and self.config.include_ligands: + # Auto-detect ligands and use openmmforcefields unless ignore_ligands + if has_ligands and not self.config.ignore_ligands: return self._relax_unconstrained_with_ligands(pdb_string) ENERGY = unit.kilocalories_per_mole diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py new file mode 100644 index 0000000..ded13bb --- /dev/null +++ b/tests/test_artifacts.py @@ -0,0 +1,326 @@ +"""Tests for crystallography artifact detection and removal.""" + +from graphrelax.artifacts import ( + BIOLOGICALLY_RELEVANT_IONS, + BUFFER_ARTIFACTS, + CRYOPROTECTANT_ARTIFACTS, + CRYSTALLOGRAPHY_ARTIFACTS, + DETERGENT_ARTIFACTS, + HALIDE_ARTIFACTS, + LIPID_ARTIFACTS, + REDUCING_AGENT_ARTIFACTS, + WATER_RESIDUES, + is_artifact, + is_biologically_relevant_ion, + remove_artifacts, +) + + +class TestArtifactConstants: + """Tests for artifact constant definitions.""" + + def test_common_buffers_included(self): + """Test that common buffer artifacts are in the set.""" + buffers = ["SO4", "PO4", "CIT", "ACT", "MES", "EPE", "TRS"] + for buf in buffers: + assert ( + buf in BUFFER_ARTIFACTS + ), f"{buf} should be in BUFFER_ARTIFACTS" + + def test_common_cryoprotectants_included(self): + """Test that 
common cryoprotectants are in the set.""" + cryo = ["GOL", "EDO", "MPD", "PEG", "DMS"] + for c in cryo: + assert ( + c in CRYOPROTECTANT_ARTIFACTS + ), f"{c} should be in CRYOPROTECTANT_ARTIFACTS" + + def test_common_detergents_included(self): + """Test that common detergents are in the set.""" + detergents = ["BOG", "LDA", "SDS"] + for d in detergents: + assert ( + d in DETERGENT_ARTIFACTS + ), f"{d} should be in DETERGENT_ARTIFACTS" + + def test_common_lipids_included(self): + """Test that common lipids/fatty acids are in the set.""" + lipids = ["PLM", "MYR", "OLA", "STE"] + for lip in lipids: + assert lip in LIPID_ARTIFACTS, f"{lip} should be in LIPID_ARTIFACTS" + + def test_reducing_agents_included(self): + """Test that common reducing agents are in the set.""" + agents = ["BME", "DTT"] + for a in agents: + assert ( + a in REDUCING_AGENT_ARTIFACTS + ), f"{a} should be in REDUCING_AGENT_ARTIFACTS" + + def test_halides_included(self): + """Test that halide ions are in the set.""" + halides = ["CL", "BR", "IOD", "F"] + for h in halides: + assert h in HALIDE_ARTIFACTS, f"{h} should be in HALIDE_ARTIFACTS" + + def test_metal_ions_not_in_artifacts(self): + """Test that biologically relevant metal ions are NOT artifacts.""" + metals = ["ZN", "MG", "CA", "FE", "MN", "CU"] + for m in metals: + assert ( + m not in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{m} should NOT be in CRYSTALLOGRAPHY_ARTIFACTS" + + def test_biologically_relevant_ions_defined(self): + """Test that biologically relevant ions are in their own set.""" + ions = ["ZN", "MG", "CA", "FE", "MN", "CU", "CO", "NI"] + for ion in ions: + assert ( + ion in BIOLOGICALLY_RELEVANT_IONS + ), f"{ion} should be in BIOLOGICALLY_RELEVANT_IONS" + + def test_master_set_contains_all_categories(self): + """Test that CRYSTALLOGRAPHY_ARTIFACTS combines all category sets.""" + all_categories = ( + BUFFER_ARTIFACTS + | CRYOPROTECTANT_ARTIFACTS + | DETERGENT_ARTIFACTS + | LIPID_ARTIFACTS + | REDUCING_AGENT_ARTIFACTS + | 
HALIDE_ARTIFACTS + ) + for artifact in all_categories: + assert ( + artifact in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{artifact} should be in CRYSTALLOGRAPHY_ARTIFACTS" + + def test_water_residues_separate(self): + """Test that water residues are in their own set.""" + waters = ["HOH", "WAT", "SOL"] + for w in waters: + assert w in WATER_RESIDUES, f"{w} should be in WATER_RESIDUES" + # Waters should NOT be in crystallography artifacts + assert ( + w not in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{w} should NOT be in CRYSTALLOGRAPHY_ARTIFACTS" + + +class TestIsArtifact: + """Tests for is_artifact function.""" + + def test_glycerol_is_artifact(self): + """Test that glycerol is detected as artifact.""" + assert is_artifact("GOL") + assert is_artifact("gol") # case insensitive + + def test_sulfate_is_artifact(self): + """Test that sulfate is detected as artifact.""" + assert is_artifact("SO4") + + def test_zinc_is_not_artifact(self): + """Test that zinc is not detected as artifact.""" + assert not is_artifact("ZN") + + def test_heme_is_not_artifact(self): + """Test that heme is not detected as artifact.""" + assert not is_artifact("HEM") + + +class TestIsBiologicallyRelevantIon: + """Tests for is_biologically_relevant_ion function.""" + + def test_zinc_is_relevant(self): + """Test that zinc is detected as relevant.""" + assert is_biologically_relevant_ion("ZN") + assert is_biologically_relevant_ion("zn") # case insensitive + + def test_magnesium_is_relevant(self): + """Test that magnesium is detected as relevant.""" + assert is_biologically_relevant_ion("MG") + + def test_glycerol_is_not_relevant_ion(self): + """Test that glycerol is not a relevant ion.""" + assert not is_biologically_relevant_ion("GOL") + + +class TestRemoveArtifacts: + """Tests for remove_artifacts function.""" + + def test_removes_glycerol(self): + """Test that glycerol atoms are removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + 
"HETATM 3 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 4 O2 GOL A 100 6.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "GOL" not in result + assert "GOL" in removed + assert removed["GOL"] == 2 + assert "ATOM" in result # Protein preserved + + def test_removes_sulfate(self): + """Test that sulfate atoms are removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 S SO4 A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 O1 SO4 A 200 6.0 5.0 5.0 1.00 0.00\n" + "HETATM 4 O2 SO4 A 200 7.0 5.0 5.0 1.00 0.00\n" + "HETATM 5 O3 SO4 A 200 8.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 O4 SO4 A 200 9.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "SO4" not in result + assert removed["SO4"] == 5 + + def test_keeps_zinc(self): + """Test that zinc ions are preserved.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 ZN ZN A 200 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "ZN" in result + assert "ZN" not in removed + + def test_keeps_heme(self): + """Test that heme is preserved (not an artifact).""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 NA HEM A 200 6.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "HEM" in result + assert "HEM" not in removed + + def test_whitelist_preserves_artifact(self): + """Test that whitelisted artifacts are preserved.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb, keep_residues={"GOL"}) + + assert "GOL" in result + assert "GOL" not in removed + + def test_whitelist_case_insensitive(self): + """Test that whitelist is case insensitive.""" + # fmt: off + 
pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb, keep_residues={"gol"}) + + assert "GOL" in result + + def test_multiple_artifact_types(self): + """Test removal of multiple artifact types.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 S SO4 A 200 6.0 6.0 6.0 1.00 0.00\n" + "HETATM 4 CL CL A 300 7.0 7.0 7.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "GOL" not in result + assert "SO4" not in result + assert "CL" not in result + assert removed["GOL"] == 1 + assert removed["SO4"] == 1 + assert removed["CL"] == 1 + + def test_empty_pdb(self): + """Test handling of empty PDB.""" + pdb = "" + result, removed = remove_artifacts(pdb) + assert result == "" + assert len(removed) == 0 + + def test_protein_only_pdb(self): + """Test handling of PDB with no artifacts.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert result == pdb + assert len(removed) == 0 + + +class TestRemoveArtifactsDetergentsAndLipids: + """Tests specifically for detergent and lipid removal.""" + + def test_removes_palmitic_acid(self): + """Test that palmitic acid is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 C1 PLM A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "PLM" not in result + assert removed["PLM"] == 1 + + def test_removes_octyl_glucoside(self): + """Test that octyl glucoside detergent is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 C1 BOG A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert 
"BOG" not in result + assert removed["BOG"] == 1 + + def test_removes_ldao(self): + """Test that LDAO detergent is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 N LDA A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "LDA" not in result + assert removed["LDA"] == 1 From 160ddf7204b47f7308e6d1a62a402ff22c2329a1 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:42:21 -0500 Subject: [PATCH 10/22] Simplify --keep-ligand to accept comma-separated values only - Remove action="append" in favor of single comma-separated string - Fix parsing logic to handle single string instead of list - Add unit tests for --keep-ligand CLI argument parsing Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/cli.py | 15 ++++---- tests/test_artifacts.py | 78 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 67ccb45..2dd0469 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -274,12 +274,10 @@ def create_parser() -> argparse.ArgumentParser: preprocess_group.add_argument( "--keep-ligand", type=str, - action="append", - metavar="RESNAME", + metavar="RESNAMES", help=( - "Keep a specific ligand residue that would otherwise be stripped " - "as an artifact. Can be used multiple times. " - "Example: --keep-ligand GOL --keep-ligand SO4" + "Keep specific ligand residue(s) that would otherwise be stripped " + "as artifacts. Comma-separated. 
Example: --keep-ligand GOL,SO4" ), ) preprocess_group.add_argument( @@ -433,10 +431,13 @@ def main(args=None) -> int: ligand_smiles=ligand_smiles, ) - # Build keep_residues set from --keep-ligand flags + # Build keep_residues set from --keep-ligand flag (comma-separated) keep_residues = set() if opts.keep_ligand: - keep_residues = {r.upper() for r in opts.keep_ligand} + for resname in opts.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) idealize_config = IdealizeConfig( enabled=opts.pre_idealize, diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py index ded13bb..3d8184a 100644 --- a/tests/test_artifacts.py +++ b/tests/test_artifacts.py @@ -324,3 +324,81 @@ def test_removes_ldao(self): assert "LDA" not in result assert removed["LDA"] == 1 + + +class TestCLIKeepLigandParsing: + """Tests for CLI --keep-ligand argument parsing.""" + + def test_single_ligand(self): + """Test parsing a single ligand name.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL"] + ) + assert args.keep_ligand == "GOL" + + def test_comma_separated_ligands(self): + """Test parsing comma-separated ligand names.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL,SO4,EDO"] + ) + assert args.keep_ligand == "GOL,SO4,EDO" + + # Test the parsing logic that builds the keep_residues set + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4", "EDO"} + + def test_comma_separated_with_spaces(self): + """Test parsing comma-separated ligand names with spaces.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", 
"in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL, SO4, EDO"] + ) + + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4", "EDO"} + + def test_lowercase_normalized_to_uppercase(self): + """Test that lowercase ligand names are normalized to uppercase.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "gol,so4"] + ) + + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4"} + + def test_no_keep_ligand(self): + """Test that keep_ligand is None when not provided.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args(["-i", "in.pdb", "-o", "out.pdb"]) + assert args.keep_ligand is None From 4e7b39c1b204c1e95b929902cb4240e68d043161 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:51:45 -0500 Subject: [PATCH 11/22] Remove redundant code and update README Code cleanup: - Consolidate WATER_RESIDUES to artifacts.py (was defined in 3 places) - Create shared check_gpu_available() in utils.py (was duplicated) - Remove unused add_ter_records_at_gaps() from chain_gaps.py - Remove unused _relax_direct() method from relaxer.py (~100 lines) - Remove corresponding test class TestAddTerRecordsAtGaps README update: - Update --keep-ligand documentation to show comma-separated syntax Total: ~180 lines of redundant code removed. 
Co-Authored-By: Claude Opus 4.5 --- README.md | 9 ++- src/graphrelax/chain_gaps.py | 57 -------------- src/graphrelax/idealize.py | 11 +-- src/graphrelax/ligand_utils.py | 7 +- src/graphrelax/relaxer.py | 134 ++------------------------------- src/graphrelax/utils.py | 39 ++++++++-- tests/test_chain_gaps.py | 22 ------ 7 files changed, 51 insertions(+), 228 deletions(-) diff --git a/README.md b/README.md index 83c1d43..4d4e325 100644 --- a/README.md +++ b/README.md @@ -193,8 +193,8 @@ graphrelax -i crystal_structure.pdb -o relaxed.pdb # Keep all HETATM records including artifacts graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-all-ligands -# Keep specific artifacts that you need -graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL --keep-ligand SO4 +# Keep specific artifacts that you need (comma-separated) +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL,SO4 ``` ### Working with Ligands @@ -349,8 +349,9 @@ Input preprocessing: --keep-waters Keep water molecules in input (default: removed) --keep-all-ligands Keep all HETATM records including crystallography artifacts. By default, common artifacts are removed. - --keep-ligand RESNAME Keep specific ligand residue (can be used multiple - times). Example: --keep-ligand GOL --keep-ligand SO4 + --keep-ligand RES1,RES2,... + Keep specific ligand residues (comma-separated). + Example: --keep-ligand GOL,SO4 --pre-idealize Idealize backbone geometry before processing. Corrects bond lengths/angles while preserving dihedral angles. By default, chain breaks are closed. diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py index e7404b7..a4987ee 100644 --- a/src/graphrelax/chain_gaps.py +++ b/src/graphrelax/chain_gaps.py @@ -278,63 +278,6 @@ def restore_chain_ids( return "\n".join(output_lines) -def add_ter_records_at_gaps(pdb_string: str, gaps: List[ChainGap]) -> str: - """ - Add TER records at gap locations to signal chain breaks. 
- - This is an alternative to chain splitting that works with force fields - that recognize TER as chain termination. - - Args: - pdb_string: PDB file contents as string - gaps: List of detected gaps - - Returns: - PDB string with TER records inserted at gap locations - """ - if not gaps: - return pdb_string - - # Build set of (chain_id, resnum, icode) where TER should be inserted - # TER goes after residue_before - ter_locations = { - (g.chain_id, g.residue_before, g.icode_before) for g in gaps - } - - output_lines = [] - prev_chain = None - prev_resnum = None - prev_icode = None - - for line in pdb_string.split("\n"): - if line.startswith(("ATOM", "HETATM")) and len(line) > 26: - chain_id = line[21] - resnum = int(line[22:26].strip()) - icode = line[26].strip() if len(line) > 26 else "" - - # Check if we need to insert TER before this line - # (i.e., previous residue was at a gap location) - if ( - prev_chain is not None - and (prev_chain, prev_resnum, prev_icode) in ter_locations - and (chain_id != prev_chain or resnum != prev_resnum) - ): - # Insert TER record - ter_line = "TER " - output_lines.append(ter_line) - logger.debug( - f"Inserted TER after {prev_chain}:{prev_resnum}{prev_icode}" - ) - - prev_chain = chain_id - prev_resnum = resnum - prev_icode = icode - - output_lines.append(line) - - return "\n".join(output_lines) - - def get_gap_summary(gaps: List[ChainGap]) -> str: """ Generate a human-readable summary of detected gaps. 
diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 49e1236..5c814e9 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -20,6 +20,7 @@ from openmm import openmm, unit from pdbfixer import PDBFixer +from graphrelax.artifacts import WATER_RESIDUES from graphrelax.chain_gaps import ( ChainGap, detect_chain_gaps, @@ -27,6 +28,7 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig +from graphrelax.utils import check_gpu_available # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" @@ -37,9 +39,6 @@ logger = logging.getLogger(__name__) -# Water residue names to preserve with protein -WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - @dataclass class DihedralAngles: @@ -482,11 +481,7 @@ def minimize_with_constraints( ) # Check for GPU - use_gpu = False - for i in range(Platform.getNumPlatforms()): - if Platform.getPlatform(i).getName() == "CUDA": - use_gpu = True - break + use_gpu = check_gpu_available() # Create simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 6c13ef8..c774e07 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -23,6 +23,9 @@ logger = logging.getLogger(__name__) +# Import WATER_RESIDUES from artifacts to avoid duplication +from graphrelax.artifacts import WATER_RESIDUES # noqa: E402 + def _check_ligand_deps(): """Raise ImportError if ligand dependencies are not available.""" @@ -36,10 +39,6 @@ def _check_ligand_deps(): ) -# Water residue names to exclude from ligand processing -WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - - @dataclass class LigandInfo: """Information about a ligand extracted from PDB.""" diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 61e613b..faa1d44 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -4,14 
+4,14 @@ import logging import sys from pathlib import Path -from typing import Optional, Tuple +from typing import Tuple import numpy as np -from openmm import Platform from openmm import app as openmm_app from openmm import openmm, unit from pdbfixer import PDBFixer +from graphrelax.artifacts import WATER_RESIDUES from graphrelax.chain_gaps import ( detect_chain_gaps, get_gap_summary, @@ -21,11 +21,11 @@ from graphrelax.config import RelaxConfig from graphrelax.idealize import extract_ligands, restore_ligands from graphrelax.ligand_utils import ( - WATER_RESIDUES, create_openff_molecule, extract_ligands_from_pdb, ligand_pdb_to_topology, ) +from graphrelax.utils import check_gpu_available # openmmforcefields is only available via conda-forge # Install with: conda install -c conda-forge openmmforcefields=0.13.0 @@ -66,22 +66,6 @@ class Relaxer: def __init__(self, config: RelaxConfig): self.config = config - self._use_gpu: Optional[bool] = None - - def _check_gpu_available(self) -> bool: - """Check if CUDA is available for OpenMM.""" - if self._use_gpu is not None: - return self._use_gpu - - for i in range(Platform.getNumPlatforms()): - if Platform.getPlatform(i).getName() == "CUDA": - self._use_gpu = True - logger.info("OpenMM CUDA platform detected, using GPU") - return True - - self._use_gpu = False - logger.info("OpenMM CUDA not available, using CPU") - return False def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ @@ -165,7 +149,7 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() relaxer = AmberRelaxation( max_iterations=self.config.max_iterations, @@ -223,7 +207,7 @@ def _relax_unconstrained( ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() logger.info( f"Running unconstrained OpenMM minimization " @@ 
-329,7 +313,7 @@ def _relax_unconstrained_with_ligands( ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() logger.info( f"Running unconstrained minimization with ligands " @@ -450,110 +434,6 @@ def _relax_unconstrained_with_ligands( violations = np.zeros(0) return relaxed_pdb, debug_data, violations - def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: - """ - Direct OpenMM minimization without pdbfixer. - - This is a simpler approach that works for already-complete structures - (like those from LigandMPNN with packed side chains). - - Args: - pdb_string: PDB file contents as string - - Returns: - Tuple of (relaxed_pdb_string, debug_info, violations) - """ - ENERGY = unit.kilocalories_per_mole - LENGTH = unit.angstroms - - use_gpu = self._check_gpu_available() - - logger.info( - f"Running direct OpenMM minimization " - f"(max_iter={self.config.max_iterations}, " - f"stiffness={self.config.stiffness}, gpu={use_gpu})" - ) - - # Parse PDB - pdb_file = io.StringIO(pdb_string) - pdb = openmm_app.PDBFile(pdb_file) - - # Create force field and system - force_field = openmm_app.ForceField( - "amber14-all.xml", "amber14/tip3pfb.xml" - ) - - # Use Modeller to add hydrogens (doesn't require pdbfixer) - modeller = openmm_app.Modeller(pdb.topology, pdb.positions) - modeller.addHydrogens(force_field) - - # Create system with constraints on hydrogen bonds - system = force_field.createSystem( - modeller.topology, constraints=openmm_app.HBonds - ) - - # Add position restraints if stiffness > 0 - if self.config.stiffness > 0: - self._add_restraints( - system, modeller, self.config.stiffness * ENERGY / (LENGTH**2) - ) - - # Create integrator and simulation - integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) - platform = openmm.Platform.getPlatformByName( - "CUDA" if use_gpu else "CPU" - ) - simulation = openmm_app.Simulation( - modeller.topology, system, integrator, platform - 
) - simulation.context.setPositions(modeller.positions) - - # Get initial energy - state = simulation.context.getState(getEnergy=True, getPositions=True) - einit = state.getPotentialEnergy().value_in_unit(ENERGY) - posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - - # Minimize - # OpenMM minimizeEnergy tolerance is in kJ/mol/nm (gradient threshold) - tolerance = ( - self.config.tolerance * unit.kilojoules_per_mole / unit.nanometer - ) - simulation.minimizeEnergy( - maxIterations=self.config.max_iterations, tolerance=tolerance - ) - - # Get final state - state = simulation.context.getState(getEnergy=True, getPositions=True) - efinal = state.getPotentialEnergy().value_in_unit(ENERGY) - pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - - # Calculate RMSD - rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - - # Write output PDB - output = io.StringIO() - openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output - ) - relaxed_pdb = output.getvalue() - - debug_data = { - "initial_energy": einit, - "final_energy": efinal, - "rmsd": rmsd, - "attempts": 1, - } - - logger.info( - f"Relaxation complete: E_init={einit:.2f}, " - f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" - ) - - # No violations tracking in direct mode - violations = np.zeros(0) - - return relaxed_pdb, debug_data, violations - def _add_restraints(self, system, modeller, stiffness): """Add harmonic position restraints to heavy atoms.""" force = openmm.CustomExternalForce( @@ -603,7 +483,7 @@ def get_energy_breakdown(self, pdb_string: str) -> dict: ) # Create simulation - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" ) diff --git a/src/graphrelax/utils.py b/src/graphrelax/utils.py index 9e2ae91..0eec674 100644 --- a/src/graphrelax/utils.py +++ b/src/graphrelax/utils.py @@ -5,10 +5,40 @@ from pathlib import Path from typing import Optional +from 
graphrelax.artifacts import WATER_RESIDUES from graphrelax.structure_io import StructureFormat logger = logging.getLogger(__name__) +# Cache for GPU availability check +_gpu_available = None + + +def check_gpu_available() -> bool: + """ + Check if CUDA is available for OpenMM. + + Results are cached after first check. + + Returns: + True if CUDA platform is available + """ + global _gpu_available + if _gpu_available is not None: + return _gpu_available + + from openmm import Platform + + for i in range(Platform.getNumPlatforms()): + if Platform.getPlatform(i).getName() == "CUDA": + _gpu_available = True + logger.info("OpenMM CUDA platform detected, using GPU") + return True + + _gpu_available = False + logger.info("OpenMM CUDA not available, using CPU") + return False + def remove_waters(structure_string: str, fmt: StructureFormat = None) -> str: """ @@ -39,7 +69,6 @@ def _remove_waters_pdb(pdb_string: str) -> str: Returns: PDB string with water molecules removed """ - water_residues = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} filtered_lines = [] for line in pdb_string.splitlines(): @@ -48,13 +77,13 @@ def _remove_waters_pdb(pdb_string: str) -> str: # Residue name is in columns 17-20 (0-indexed: 17:20) if len(line) >= 20: resname = line[17:20].strip() - if resname in water_residues: + if resname in WATER_RESIDUES: continue # Check TER records that might reference water elif line.startswith("TER"): if len(line) >= 20: resname = line[17:20].strip() - if resname in water_residues: + if resname in WATER_RESIDUES: continue filtered_lines.append(line) @@ -77,11 +106,9 @@ def _remove_waters_cif(cif_string: str) -> str: from Bio.PDB import MMCIFIO, MMCIFParser, Select - water_residues = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - class WaterRemover(Select): def accept_residue(self, residue): - return residue.get_resname().strip() not in water_residues + return residue.get_resname().strip() not in WATER_RESIDUES # MMCIFParser requires a file path with 
tempfile.NamedTemporaryFile( diff --git a/tests/test_chain_gaps.py b/tests/test_chain_gaps.py index 53b29d7..c5119a2 100644 --- a/tests/test_chain_gaps.py +++ b/tests/test_chain_gaps.py @@ -4,7 +4,6 @@ from graphrelax.chain_gaps import ( ChainGap, - add_ter_records_at_gaps, detect_chain_gaps, get_gap_summary, restore_chain_ids, @@ -302,27 +301,6 @@ def test_roundtrip_preserves_atoms(self, gapped_peptide_pdb): assert orig_atoms == restored_atoms -class TestAddTerRecordsAtGaps: - """Tests for add_ter_records_at_gaps function.""" - - def test_no_ter_without_gaps(self, continuous_peptide_pdb): - """Test no TER records added to continuous peptide.""" - gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) - result = add_ter_records_at_gaps(continuous_peptide_pdb, gaps) - assert result == continuous_peptide_pdb - - def test_adds_ter_at_gap(self, gapped_peptide_pdb): - """Test TER record is added at gap location.""" - gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) - result = add_ter_records_at_gaps(gapped_peptide_pdb, gaps) - - # Count TER records - ter_count = sum( - 1 for line in result.split("\n") if line.startswith("TER") - ) - assert ter_count >= 1 - - class TestGetGapSummary: """Tests for get_gap_summary function.""" From 09e50939d97af917fbd60d1a97b1dfba7345a206 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 11:06:37 -0500 Subject: [PATCH 12/22] Remove requirements.txt and fix integration tests - Remove requirements.txt (dependencies in pyproject.toml) - Update test_relaxer_integration.py to use public API: - Use check_gpu_available() from utils instead of removed method - Use relax() with constrained=False instead of removed _relax_direct() Co-Authored-By: Claude Opus 4.5 --- requirements.txt | 49 ---------------- tests/test_relaxer_integration.py | 92 ++++++++++++++++++------------- 2 files changed, 55 insertions(+), 86 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt 
b/requirements.txt deleted file mode 100644 index 3c80844..0000000 --- a/requirements.txt +++ /dev/null @@ -1,49 +0,0 @@ -# GraphRelax Requirements -# ====================== -# -# This file lists ALL dependencies needed to run GraphRelax with full -# functionality, including ligand support. -# -# INSTALLATION ORDER: -# 1. First install conda-forge packages (not available on PyPI) -# 2. Then install pip packages -# -# Quick install (recommended): -# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit -# pip install -r requirements.txt -# -# Or use the environment.yml file for a complete conda environment. - -# ============================================================================= -# CONDA-FORGE ONLY (must be installed via conda before pip) -# ============================================================================= -# These packages are NOT on PyPI and must be installed via conda-forge: -# -# conda install -c conda-forge \ -# pdbfixer \ -# openmmforcefields>=0.13.0 \ -# openff-toolkit>=0.14.0 \ -# rdkit>=2023.09.1 -# -# pdbfixer: Required for unconstrained minimization and ligand support -# openmmforcefields: Required for --include-ligands (ligand force fields) -# openff-toolkit: Required for --include-ligands (molecule parameterization) -# rdkit: Required for --include-ligands (bond perception from PDB) - -# ============================================================================= -# PIP PACKAGES (install after conda packages) -# ============================================================================= - -# Core dependencies -torch>=2.0 -numpy<2 # PyTorch <2.5 is incompatible with NumPy 2.x -openmm -biopython -prody -absl-py -ml-collections -dm-tree - -# Development/testing (optional) -# pytest>=7.0 -# pytest-cov>=4.1 diff --git a/tests/test_relaxer_integration.py b/tests/test_relaxer_integration.py index d62d92d..fb37596 100644 --- a/tests/test_relaxer_integration.py +++ b/tests/test_relaxer_integration.py @@ -20,29 
+20,44 @@ def relaxer(): return Relaxer(config) +@pytest.fixture +def unconstrained_relaxer(): + """Create a Relaxer instance with unconstrained minimization.""" + from graphrelax.relaxer import Relaxer + + config = RelaxConfig(max_iterations=50, stiffness=10.0, constrained=False) + return Relaxer(config) + + @pytest.mark.integration class TestRelaxerGPUDetection: """Tests for GPU detection logic.""" - def test_check_gpu_available_returns_bool(self, relaxer): + def test_check_gpu_available_returns_bool(self): """Test that GPU check returns a boolean.""" - result = relaxer._check_gpu_available() + from graphrelax.utils import check_gpu_available + + result = check_gpu_available() assert isinstance(result, bool) - def test_check_gpu_available_cached(self, relaxer): + def test_check_gpu_available_cached(self): """Test that GPU check result is cached.""" - result1 = relaxer._check_gpu_available() - result2 = relaxer._check_gpu_available() + from graphrelax.utils import check_gpu_available + + result1 = check_gpu_available() + result2 = check_gpu_available() assert result1 == result2 @pytest.mark.integration -class TestRelaxDirect: - """Tests for direct OpenMM minimization.""" +class TestRelaxUnconstrained: + """Tests for unconstrained OpenMM minimization.""" - def test_relax_direct_runs(self, relaxer, small_peptide_pdb_string): - """Test that direct relaxation completes without error.""" - relaxed_pdb, debug_info, violations = relaxer._relax_direct( + def test_relax_unconstrained_runs( + self, unconstrained_relaxer, small_peptide_pdb_string + ): + """Test that unconstrained relaxation completes without error.""" + relaxed_pdb, debug_info, violations = unconstrained_relaxer.relax( small_peptide_pdb_string ) @@ -50,39 +65,44 @@ def test_relax_direct_runs(self, relaxer, small_peptide_pdb_string): assert isinstance(relaxed_pdb, str) assert len(relaxed_pdb) > 0 - def test_relax_direct_returns_debug_info( - self, relaxer, small_peptide_pdb_string + def 
test_relax_unconstrained_returns_debug_info( + self, unconstrained_relaxer, small_peptide_pdb_string ): """Test that debug info contains expected keys.""" - _, debug_info, _ = relaxer._relax_direct(small_peptide_pdb_string) + _, debug_info, _ = unconstrained_relaxer.relax(small_peptide_pdb_string) assert "initial_energy" in debug_info assert "final_energy" in debug_info assert "rmsd" in debug_info - assert "attempts" in debug_info - def test_relax_direct_energy_types(self, relaxer, small_peptide_pdb_string): + def test_relax_unconstrained_energy_types( + self, unconstrained_relaxer, small_peptide_pdb_string + ): """Test that energy values are numeric.""" - _, debug_info, _ = relaxer._relax_direct(small_peptide_pdb_string) + _, debug_info, _ = unconstrained_relaxer.relax(small_peptide_pdb_string) assert isinstance(debug_info["initial_energy"], (int, float)) assert isinstance(debug_info["final_energy"], (int, float)) assert isinstance(debug_info["rmsd"], (int, float)) - def test_relax_direct_pdb_format(self, relaxer, small_peptide_pdb_string): + def test_relax_unconstrained_pdb_format( + self, unconstrained_relaxer, small_peptide_pdb_string + ): """Test that output is valid PDB format.""" - relaxed_pdb, _, _ = relaxer._relax_direct(small_peptide_pdb_string) + relaxed_pdb, _, _ = unconstrained_relaxer.relax( + small_peptide_pdb_string + ) # Should contain ATOM records assert "ATOM" in relaxed_pdb or "HETATM" in relaxed_pdb - def test_relax_direct_violations_array( - self, relaxer, small_peptide_pdb_string + def test_relax_unconstrained_violations_array( + self, unconstrained_relaxer, small_peptide_pdb_string ): """Test that violations is a numpy array.""" import numpy as np - _, _, violations = relaxer._relax_direct(small_peptide_pdb_string) + _, _, violations = unconstrained_relaxer.relax(small_peptide_pdb_string) assert isinstance(violations, np.ndarray) @@ -135,30 +155,28 @@ def test_get_energy_breakdown_has_total( class TestRelaxerConfig: """Tests for Relaxer with 
different configurations.""" - def test_high_stiffness(self, small_peptide_pdb_string): - """Test relaxation with high stiffness (more restrained).""" + def test_high_stiffness_constrained(self, small_peptide_pdb_string): + """Test constrained relaxation with high stiffness (more restrained).""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=50, stiffness=100.0) + config = RelaxConfig( + max_iterations=50, stiffness=100.0, constrained=True + ) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) # High stiffness should result in small RMSD assert debug_info["rmsd"] < 1.0 # Less than 1 Angstrom - def test_zero_stiffness(self, small_peptide_pdb_string): - """Test relaxation with no restraints.""" + def test_unconstrained_minimization(self, small_peptide_pdb_string): + """Test unconstrained minimization (no restraints).""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=50, stiffness=0.0) + config = RelaxConfig(max_iterations=50, constrained=False) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) assert relaxed_pdb is not None @@ -166,11 +184,11 @@ def test_limited_iterations(self, small_peptide_pdb_string): """Test relaxation with limited iterations.""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=10, stiffness=10.0) + config = RelaxConfig( + max_iterations=10, stiffness=10.0, constrained=False + ) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) assert relaxed_pdb is not None From 916b4c15e6cd4d7282a59258937c5e8b396f77e0 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 
11:09:24 -0500 Subject: [PATCH 13/22] Clarify conda-forge dependencies in README - Make it clear that openmmforcefields, openff-toolkit, and rdkit are conda-forge only (like pdbfixer) - Add ligand support installation command to PyPI section - Reorganize dependencies table to show required vs optional Co-Authored-By: Claude Opus 4.5 --- README.md | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4d4e325..70b3838 100644 --- a/README.md +++ b/README.md @@ -11,14 +11,20 @@ GraphRelax requires pdbfixer, which is only available via conda-forge. We recomm ### From PyPI (Latest Release) ```bash -# First, install pdbfixer via mamba/conda (required) +# First, install conda-forge dependencies (not available on PyPI) mamba install -c conda-forge pdbfixer # Then install graphrelax from PyPI pip install graphrelax ``` -This installs the latest stable release. +This installs the latest stable release with basic functionality. + +**For ligand support** (auto-parameterization of small molecules), also install: + +```bash +mamba install -c conda-forge openmmforcefields openff-toolkit rdkit +``` ### From Source (Latest Development Version) @@ -60,17 +66,20 @@ Core dependencies (installed automatically via pip): - absl-py - ml-collections -Required (must be installed separately via conda): +Conda-forge only (must be installed via mamba/conda, not pip): -- pdbfixer (conda-forge only, not on PyPI) +| Package | Required | Purpose | +| ----------------- | -------- | -------------------------------- | +| pdbfixer | Yes | Structure preparation | +| openmmforcefields | Optional | Small molecule force fields | +| openff-toolkit | Optional | OpenFF Molecule parameterization | +| rdkit | Optional | Bond perception from PDB | -Optional (for ligand support): +The optional packages are needed for **ligand support** (auto-parameterization of small molecules during minimization). 
Install all conda-forge dependencies with: -| Package | Version | Purpose | -| ----------------- | ------------ | --------------------------- | -| openmmforcefields | >= 0.13.0 | Small molecule force fields | -| openff-toolkit | >= 0.14.0 | OpenFF Molecule creation | -| rdkit | >= 2023.09.1 | Bond perception from PDB | +```bash +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +``` ### Docker From ba18f8e8038daa353cea44ac9f91e4da3fc482df Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 11:10:43 -0500 Subject: [PATCH 14/22] Make all conda-forge dependencies mandatory - pdbfixer, openmmforcefields, openff-toolkit, and rdkit are now all required for installation - Simplified installation instructions - Removed "optional" labeling from dependencies table Co-Authored-By: Claude Opus 4.5 --- README.md | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 70b3838..a386636 100644 --- a/README.md +++ b/README.md @@ -6,32 +6,21 @@ GraphRelax combines **LigandMPNN** (for sequence design and side-chain packing) ## Installation -GraphRelax requires pdbfixer, which is only available via conda-forge. We recommend using mamba for faster installation. +GraphRelax requires several packages that are only available via conda-forge. We recommend using mamba for faster installation. ### From PyPI (Latest Release) ```bash # First, install conda-forge dependencies (not available on PyPI) -mamba install -c conda-forge pdbfixer +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit # Then install graphrelax from PyPI pip install graphrelax ``` -This installs the latest stable release with basic functionality. 
- -**For ligand support** (auto-parameterization of small molecules), also install: - -```bash -mamba install -c conda-forge openmmforcefields openff-toolkit rdkit -``` - ### From Source (Latest Development Version) ```bash -# First, install pdbfixer via mamba/conda (required) -mamba install -c conda-forge pdbfixer - # Clone the repository git clone https://github.com/delalamo/GraphRelax.git cd GraphRelax @@ -46,8 +35,6 @@ mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit pip install -e . ``` -This installs the latest development version with all recent changes. - LigandMPNN model weights (~40MB) are downloaded automatically on first run. > **Note:** `mamba` is a drop-in replacement for `conda` that's 10-100x faster. Install it with `conda install -n base -c conda-forge mamba`, or use `micromamba` as a standalone tool. @@ -68,14 +55,14 @@ Core dependencies (installed automatically via pip): Conda-forge only (must be installed via mamba/conda, not pip): -| Package | Required | Purpose | -| ----------------- | -------- | -------------------------------- | -| pdbfixer | Yes | Structure preparation | -| openmmforcefields | Optional | Small molecule force fields | -| openff-toolkit | Optional | OpenFF Molecule parameterization | -| rdkit | Optional | Bond perception from PDB | +| Package | Purpose | +| ----------------- | -------------------------------- | +| pdbfixer | Structure preparation | +| openmmforcefields | Small molecule force fields | +| openff-toolkit | OpenFF Molecule parameterization | +| rdkit | Bond perception from PDB | -The optional packages are needed for **ligand support** (auto-parameterization of small molecules during minimization). 
Install all conda-forge dependencies with: +Install all conda-forge dependencies with: ```bash mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit @@ -226,8 +213,6 @@ graphrelax -i protein_with_ligand.pdb -o designed.pdb \ graphrelax -i protein_with_ligand.pdb -o designed.pdb --ignore-ligands ``` -**Requires:** `mamba install -c conda-forge openmmforcefields openff-toolkit rdkit` - For design around ligands, use `ligand_mpnn` model type: ```bash From 0d99579a2625bd7cec1334e7047ec6baf40fd0c8 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 12:45:26 -0500 Subject: [PATCH 15/22] Enable idealization by default and add missing residue control - Change idealization from opt-in (--pre-idealize) to opt-out (--no-idealize) - IdealizeConfig.enabled now defaults to True - Add --ignore-missing-residues flag to skip adding residues from SEQRES - Add --overwrite flag to allow overwriting output files - Preserve residue numbering with keepIds=True in PDB output - Update README to reflect new default behavior Co-Authored-By: Claude Opus 4.5 --- README.md | 45 ++++++----- .../LigandMPNN/openfold/np/relax/cleanup.py | 3 + src/graphrelax/cli.py | 79 ++++++++++++++++--- src/graphrelax/config.py | 3 +- src/graphrelax/idealize.py | 2 +- src/graphrelax/relaxer.py | 18 ++++- 6 files changed, 116 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index a386636..38e4c24 100644 --- a/README.md +++ b/README.md @@ -239,31 +239,36 @@ graphrelax -i protein_with_ligand.pdb -o designed.pdb \ **Requires:** `mamba install -c conda-forge pdbfixer` -### Pre-Idealization +### Backbone Idealization -GraphRelax can optionally idealize backbone geometry before processing. This is useful for structures with distorted bond lengths or angles (e.g., from homology modeling or low-resolution experimental data). The idealization step: +By default, GraphRelax idealizes backbone geometry before processing. 
This corrects distorted bond lengths and angles commonly found in experimental structures or homology models. The idealization step: 1. Corrects backbone bond lengths and angles to ideal values 2. Preserves phi/psi/omega dihedral angles -3. Adds missing atoms and optionally missing residues from SEQRES +3. Adds missing atoms 4. Runs constrained minimization to relieve local strain 5. By default, closes chain breaks (gaps) in the structure +**Note:** By default, GraphRelax adds missing residues from SEQRES records during both relaxation and idealization. This may change residue numbering if the input PDB is missing N/C-terminal residues. Use `--ignore-missing-residues` to preserve original numbering for resfile compatibility. + ```bash -# Idealize before relaxation -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize +# Default: idealization is enabled +graphrelax -i input.pdb -o relaxed.pdb -# Idealize but don't add missing residues from SEQRES -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize --ignore-missing-residues +# Skip idealization (use input geometry as-is) +graphrelax -i input.pdb -o relaxed.pdb --no-idealize -# Idealize but keep chain breaks as separate chains (don't close gaps) -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize --retain-chainbreaks +# Don't add missing residues (preserve original numbering for resfiles) +graphrelax -i input.pdb -o relaxed.pdb --ignore-missing-residues + +# Keep chain breaks as separate chains (don't close gaps) +graphrelax -i input.pdb -o relaxed.pdb --retain-chainbreaks # Combine with design -graphrelax -i input.pdb -o designed.pdb --pre-idealize --design +graphrelax -i input.pdb -o designed.pdb --design ``` -**Note:** Pre-idealization requires pdbfixer (`mamba install -c conda-forge pdbfixer`). +**Note:** Idealization requires pdbfixer (`mamba install -c conda-forge pdbfixer`). ### Resfile Format @@ -346,15 +351,16 @@ Input preprocessing: --keep-ligand RES1,RES2,... 
Keep specific ligand residues (comma-separated). Example: --keep-ligand GOL,SO4 - --pre-idealize Idealize backbone geometry before processing. - Corrects bond lengths/angles while preserving - dihedral angles. By default, chain breaks are closed. - Requires pdbfixer. + --no-idealize Skip backbone idealization. By default, backbone + geometry is idealized (bond lengths/angles corrected + while preserving dihedrals). Requires pdbfixer. --ignore-missing-residues Do not add missing residues from SEQRES during - pre-idealization. By default, missing terminal and - internal loop residues are added. - --retain-chainbreaks Do not close chain breaks during pre-idealization. + processing. By default, missing terminal and + internal loop residues are added during relaxation + and idealization. Use this flag to preserve + original PDB residue numbering for resfile compatibility. + --retain-chainbreaks Do not close chain breaks during idealization. By default, chain breaks are closed by treating all segments as a single chain. 
@@ -364,6 +370,7 @@ Scoring: General: -v, --verbose Verbose output --seed N Random seed for reproducibility + --overwrite Overwrite output files if they exist (default: error) ``` ### Scorefile Output @@ -397,7 +404,7 @@ config = PipelineConfig( constrained=False, # Default: unconstrained minimization ), idealize=IdealizeConfig( - enabled=True, # Enable pre-idealization + enabled=True, # Idealization enabled by default add_missing_residues=True, # Add missing residues from SEQRES close_chainbreaks=True, # Close chain breaks (default) ), diff --git a/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py b/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py index 6f062a0..72b69f0 100644 --- a/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py +++ b/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py @@ -57,6 +57,9 @@ def fix_pdb(pdbfile, alterations_info): _remove_heterogens(fixer, alterations_info, keep_water=False) fixer.findMissingResidues() alterations_info["missing_residues"] = fixer.missingResidues + # Clear missing residues to preserve original residue numbering + # Missing residues can be added during idealization (on by default) if desired + fixer.missingResidues = {} fixer.findMissingAtoms() alterations_info["missing_heavy_atoms"] = fixer.missingAtoms alterations_info["missing_terminals"] = fixer.missingTerminals diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 2dd0469..532cd35 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -19,6 +19,43 @@ def setup_logging(verbose: bool): ) +def _get_output_paths(output_path: Path, n_outputs: int) -> list: + """ + Get list of all output file paths that will be written.
+ + Args: + output_path: Base output path + n_outputs: Number of outputs to generate + + Returns: + List of Path objects for all output files + """ + if n_outputs == 1: + return [output_path] + + paths = [] + stem = output_path.stem + suffix = output_path.suffix + for i in range(1, n_outputs + 1): + paths.append(output_path.parent / f"{stem}_{i}{suffix}") + return paths + + +def _check_output_exists(output_path: Path, n_outputs: int) -> list: + """ + Check if any output files already exist. + + Args: + output_path: Base output path + n_outputs: Number of outputs to generate + + Returns: + List of existing file paths + """ + paths = _get_output_paths(output_path, n_outputs) + return [p for p in paths if p.exists()] + + def _check_for_ligands(input_path: Path, fmt) -> bool: """ Check if input structure has ligands (non-water HETATM records). @@ -281,28 +318,29 @@ def create_parser() -> argparse.ArgumentParser: ), ) preprocess_group.add_argument( - "--pre-idealize", + "--no-idealize", action="store_true", help=( - "Idealize backbone geometry before processing. " - "Runs constrained minimization to fix local geometry while " - "preserving dihedral angles. By default, chain breaks are closed." + "Skip backbone idealization. By default, backbone geometry is " + "idealized before processing (corrects bond lengths/angles while " + "preserving dihedral angles). Use this flag to skip idealization." ), ) preprocess_group.add_argument( "--ignore-missing-residues", action="store_true", help=( - "Do not add missing residues from SEQRES during pre-idealization. " - "By default, missing N/C-terminal residues and internal loops are " - "added based on SEQRES records." + "Do not add missing residues from SEQRES. By default, missing " + "N/C-terminal residues and internal loops are added. Use this " + "to preserve original PDB residue numbering for resfile " + "compatibility." 
), ) preprocess_group.add_argument( "--retain-chainbreaks", action="store_true", help=( - "Do not close chain breaks during pre-idealization. " + "Do not close chain breaks during idealization. " "By default, chain breaks are closed by treating all segments " "as a single chain. Use this to preserve gaps." ), @@ -322,6 +360,11 @@ def create_parser() -> argparse.ArgumentParser: metavar="N", help="Random seed for reproducibility", ) + general_group.add_argument( + "--overwrite", + action="store_true", + help="Overwrite output files if they exist (default: error if exists)", + ) return parser @@ -343,6 +386,23 @@ def main(args=None) -> int: logger.error(f"Resfile not found: {opts.resfile}") return 1 + # Check if output files already exist (unless --overwrite is set) + if not opts.overwrite: + existing = _check_output_exists(opts.output, opts.n_outputs) + if existing: + if len(existing) == 1: + logger.error( + f"Output file already exists: {existing[0]}. " + "Use --overwrite to replace." + ) + else: + files = ", ".join(str(p) for p in existing) + logger.error( + f"Output files already exist: {files}. " + "Use --overwrite to replace." 
+ ) + return 1 + # Validate input format input_suffix = opts.input.suffix.lower() if input_suffix not in (".pdb", ".cif", ".mmcif"): @@ -426,6 +486,7 @@ def main(args=None) -> int: max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, + add_missing_residues=not opts.ignore_missing_residues, ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, @@ -440,7 +501,7 @@ def main(args=None) -> int: keep_residues.add(resname) idealize_config = IdealizeConfig( - enabled=opts.pre_idealize, + enabled=not opts.no_idealize, add_missing_residues=not opts.ignore_missing_residues, close_chainbreaks=not opts.retain_chainbreaks, ) diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 81a126e..6597ca2 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -44,6 +44,7 @@ class RelaxConfig: max_outer_iterations: int = 3 # Violation-fixing iterations constrained: bool = False # Use constrained (AmberRelaxation) minimization split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure + add_missing_residues: bool = True # Add missing residues from SEQRES # GPU is auto-detected and used when available # Ligand support options (ligands are auto-detected) @@ -60,7 +61,7 @@ class RelaxConfig: class IdealizeConfig: """Configuration for structure idealization preprocessing.""" - enabled: bool = False # Idealization disabled by default + enabled: bool = True # Idealization enabled by default fix_cis_omega: bool = True # Correct non-trans peptide bonds (except Pro) post_idealize_stiffness: float = 10.0 # kcal/mol/A^2 for restraint add_missing_residues: bool = True # Add missing residues from SEQRES diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 5c814e9..3ce8336 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -498,7 +498,7 @@ def minimize_with_constraints( state = 
simulation.context.getState(getPositions=True) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) logger.debug("Post-idealization minimization complete") diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index faa1d44..1e3955d 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -217,6 +217,11 @@ def _relax_unconstrained( # Use pdbfixer to add missing atoms and terminal groups fixer = PDBFixer(pdbfile=io.StringIO(pdb_string)) fixer.findMissingResidues() + if not self.config.add_missing_residues: + fixer.missingResidues = {} # Clear to preserve original numbering + elif fixer.missingResidues: + n_missing = sum(len(v) for v in fixer.missingResidues.values()) + logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -263,10 +268,10 @@ def _relax_unconstrained( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - # Write output PDB + # Write output PDB (keepIds=True preserves original residue numbering) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) relaxed_pdb = output.getvalue() @@ -328,6 +333,11 @@ def _relax_unconstrained_with_ligands( # Step 2: Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) fixer.findMissingResidues() + if not self.config.add_missing_residues: + fixer.missingResidues = {} # Clear to preserve original numbering + elif fixer.missingResidues: + n_missing = sum(len(v) for v in fixer.missingResidues.values()) + logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -410,10 +420,10 @@ def _relax_unconstrained_with_ligands( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 
2) / len(posinit)) - # Write output PDB + # Write output PDB (keepIds=True preserves original residue numbering) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) relaxed_pdb = output.getvalue() From 434e11f3cb580e350cf2b8092c5e07382bbbb12e Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:09:43 -0500 Subject: [PATCH 16/22] Fix residue numbering after idealization and add resfile warning - Add renumber_residues_sequential() to idealize.py to ensure sequential residue numbering (1, 2, 3...) per chain after pdbfixer adds missing residues. This fixes false chain gap detection caused by non-sequential numbers from pdbfixer. - Add CLI warning when using resfile with idealization enabled, since residue numbers in the resfile must match the idealized structure - Update README to document the residue renumbering behavior Co-Authored-By: Claude Opus 4.5 --- README.md | 7 ++++- src/graphrelax/cli.py | 9 ++++++ src/graphrelax/idealize.py | 60 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 38e4c24..ddfe753 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,12 @@ By default, GraphRelax idealizes backbone geometry before processing. This corre 4. Runs constrained minimization to relieve local strain 5. By default, closes chain breaks (gaps) in the structure -**Note:** By default, GraphRelax adds missing residues from SEQRES records during both relaxation and idealization. This may change residue numbering if the input PDB is missing N/C-terminal residues. Use `--ignore-missing-residues` to preserve original numbering for resfile compatibility. +**Important:** By default, GraphRelax adds missing residues from SEQRES records during idealization and renumbers all residues sequentially starting from 1 for each chain. 
This ensures consistent numbering regardless of the original PDB numbering scheme. + +If you're using a resfile, the residue numbers must match the **idealized** structure numbering (sequential from 1), not the original PDB numbering. To preserve original numbering for resfile compatibility, use one of these options: + +- `--ignore-missing-residues`: Keep original numbering, don't add missing residues +- `--no-idealize`: Skip idealization entirely (preserves original geometry and numbering) ```bash # Default: idealization is enabled diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 532cd35..1f22fa1 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -456,6 +456,15 @@ def main(args=None) -> int: else: logger.info("Ligands auto-detected, using openmmforcefields") + # Warn about resfile + idealization interaction + if opts.resfile and not opts.no_idealize: + logger.warning( + "Using resfile with idealization enabled. Residue numbers in " + "the resfile should match the idealized structure (sequential " + "numbering starting from 1). Use --no-idealize or " + "--ignore-missing-residues to preserve original numbering." + ) + logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") logger.info(f"Output: {opts.output}") diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 3ce8336..efcc280 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -615,5 +615,65 @@ def idealize_structure( # Step 7: Restore ligands final_pdb = restore_ligands(minimized_pdb, ligand_lines) + # Step 8: Renumber residues sequentially per chain + # This fixes issues where pdbfixer assigns non-sequential numbers + final_pdb = renumber_residues_sequential(final_pdb) + logger.info("Structure idealization complete") return final_pdb, gaps + + +def renumber_residues_sequential(pdb_string: str) -> str: + """ + Renumber residues sequentially starting from 1 for each chain. 
+ + This fixes issues where pdbfixer assigns non-sequential residue numbers + when adding missing residues. Sequential numbering is required for + proper chain gap detection and LigandMPNN processing. + + HETATM records are renumbered to continue after the last ATOM residue + in each chain. + + Args: + pdb_string: PDB file contents + + Returns: + PDB string with sequential residue numbering + """ + lines = pdb_string.split("\n") + output_lines = [] + + # Track residue numbering per chain + chain_residue_count = {} # chain_id -> next_resnum + residue_map = {} # (chain_id, old_resnum, icode) -> new_resnum + + # First pass: assign new numbers to unique residues + for line in lines: + if line.startswith("ATOM") or line.startswith("HETATM"): + chain_id = line[21] + old_resnum = line[22:26].strip() + icode = line[26] + key = (chain_id, old_resnum, icode) + + if key not in residue_map: + if chain_id not in chain_residue_count: + chain_residue_count[chain_id] = 1 + residue_map[key] = chain_residue_count[chain_id] + chain_residue_count[chain_id] += 1 + + # Second pass: apply new numbers + for line in lines: + if line.startswith("ATOM") or line.startswith("HETATM"): + chain_id = line[21] + old_resnum = line[22:26].strip() + icode = line[26] + key = (chain_id, old_resnum, icode) + + new_resnum = residue_map[key] + # Format residue number right-justified in columns 23-26 + new_line = line[:22] + f"{new_resnum:>4}" + " " + line[27:] + output_lines.append(new_line) + else: + output_lines.append(line) + + return "\n".join(output_lines) From 7368829bcd23b58ab0224bc61aed40c4810cc5ad Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:17:58 -0500 Subject: [PATCH 17/22] Include ligands in unconstrained minimization Fix bug where ligands were extracted before the ligand-presence check, causing the ligand-aware minimization path to never be triggered. 
Now for unconstrained minimization with ligands: - Ligands are detected before any extraction - Full PDB (with ligands) is passed to _relax_unconstrained() - Ligands are parameterized via openmmforcefields and minimized together with the protein For constrained minimization, ligands are still extracted and restored unchanged since AmberRelaxation cannot handle arbitrary ligands. This fixes protein-ligand clashes that occurred when the protein moved during minimization while ligands stayed in their original positions. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/relaxer.py | 50 +++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 1e3955d..898df2c 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -77,9 +77,12 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: If split_chains_at_gaps is enabled, chains will be split at detected gaps before minimization to prevent artificial gap closure. - Ligands (non-water HETATM records) are extracted before relaxation - and restored afterward, since standard AMBER force fields cannot - parameterize arbitrary ligands. + For unconstrained minimization with ligands present, ligands are + parameterized using openmmforcefields and included in the minimization. + + For constrained minimization, ligands are extracted before relaxation + and restored afterward (unchanged) since AmberRelaxation cannot + handle arbitrary ligands. 
Args: pdb_string: PDB file contents as string @@ -87,7 +90,46 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Extract ligands before relaxation (AMBER can't parameterize them) + # Check if ligands are present (non-water HETATM records) + has_ligands = any( + line.startswith("HETATM") + and line[17:20].strip() not in WATER_RESIDUES + for line in pdb_string.split("\n") + ) + + # For unconstrained minimization with ligands, use the ligand-aware path + # which includes ligands in the minimization via openmmforcefields + if not self.config.constrained and has_ligands: + if not self.config.ignore_ligands: + # Detect and handle chain gaps on protein portion only + chain_mapping = {} + if self.config.split_chains_at_gaps: + protein_pdb, _ = extract_ligands(pdb_string) + gaps = detect_chain_gaps(protein_pdb) + if gaps: + logger.info(get_gap_summary(gaps)) + # Split the full PDB (with ligands) at gaps + pdb_string, chain_mapping = split_chains_at_gaps( + pdb_string, gaps + ) + + # Run unconstrained minimization with ligands included + relaxed_pdb, debug_info, violations = self._relax_unconstrained( + pdb_string + ) + + # Restore original chain IDs if chains were split + if chain_mapping: + relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) + debug_info["chains_split"] = True + debug_info["gaps_detected"] = len( + [k for k, v in chain_mapping.items() if k != v] + ) + + return relaxed_pdb, debug_info, violations + + # For constrained minimization or when ignoring ligands: + # Extract ligands, relax protein-only, restore ligands protein_pdb, ligand_lines = extract_ligands(pdb_string) if ligand_lines.strip(): logger.debug( From bad8cfcf87131bed9f5bff44017dbeea6de790b1 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:22:00 -0500 Subject: [PATCH 18/22] Include ligands in idealization minimization Update idealization pipeline to properly handle ligands: 1. 
Add missing residues and atoms (protein only) 2. Idealize bond lengths and angles 3. Minimize protein with constraints (without ligands) 4. Reintroduce ligands and minimize protein+ligand complex together This ensures ligands move with the protein during idealization rather than staying at their original coordinates while the protein moves. New function minimize_with_ligands() uses openmmforcefields to parameterize ligands and perform constrained minimization on the full protein-ligand complex. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 207 +++++++++++++++++++++++++++++++++++-- src/graphrelax/pipeline.py | 5 +- 2 files changed, 200 insertions(+), 12 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index efcc280..80ac97e 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -28,8 +28,22 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig +from graphrelax.ligand_utils import ( + create_openff_molecule, + extract_ligands_from_pdb, + ligand_pdb_to_topology, +) from graphrelax.utils import check_gpu_available +# openmmforcefields is only available via conda-forge +try: + from openmmforcefields.generators import SystemGenerator + + OPENMMFF_AVAILABLE = True +except ImportError: + OPENMMFF_AVAILABLE = False + SystemGenerator = None + # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" if str(LIGANDMPNN_PATH) not in sys.path: @@ -505,6 +519,143 @@ def minimize_with_constraints( return output.getvalue() +def minimize_with_ligands( + pdb_string: str, + stiffness: float = 10.0, + ligand_forcefield: str = "openff-2.0.0", + ligand_smiles: dict = None, +) -> str: + """ + Run constrained minimization with ligand support via openmmforcefields. + + This function is used as a final step in idealization when ligands are + present. It minimizes the protein-ligand complex with position restraints + on all heavy atoms. 
+ + Args: + pdb_string: PDB file contents (protein + ligands) + stiffness: Restraint force constant in kcal/mol/A^2 + ligand_forcefield: Force field for ligands (openff-2.0.0, gaff-2.11) + ligand_smiles: Optional dict mapping resname -> SMILES + + Returns: + Minimized PDB string + + Raises: + ImportError: If openmmforcefields is not available. + """ + if not OPENMMFF_AVAILABLE: + raise ImportError( + "Ligand minimization requires openmmforcefields.\n" + "Install with: conda install -c conda-forge openmmforcefields" + ) + + ENERGY = unit.kilocalories_per_mole + LENGTH = unit.angstroms + + logger.debug( + f"Running constrained minimization with ligands " + f"(stiffness={stiffness}, forcefield={ligand_forcefield})" + ) + + # Separate protein and ligands + protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + ligand_names = [lig.resname for lig in ligands] + logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") + + # Fix protein with pdbfixer (without ligands) + fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) + fixer.findMissingResidues() + fixer.missingResidues = {} # Don't add missing residues at this stage + fixer.findMissingAtoms() + fixer.addMissingAtoms() + + # Create OpenFF molecules for ligands + user_smiles = ligand_smiles or {} + openff_molecules = [] + for ligand in ligands: + smiles = user_smiles.get(ligand.resname) + try: + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) + except Exception as e: + logger.warning( + f"Could not parameterize ligand {ligand.resname}: {e}. " + "Ligand will be excluded from minimization." 
+ ) + continue + + # Create combined topology using Modeller + modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + + # Add ligands back to modeller + for ligand in ligands: + ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) + modeller.add(ligand_topology, ligand_positions) + + # Create SystemGenerator with ligand support + system_generator = SystemGenerator( + forcefields=["amber/ff14SB.xml"], + small_molecule_forcefield=ligand_forcefield, + molecules=openff_molecules, + forcefield_kwargs={ + "constraints": openmm_app.HBonds, + "removeCMMotion": True, + }, + ) + + # Add hydrogens + modeller.addHydrogens(system_generator.forcefield) + + # Create system + system = system_generator.create_system(modeller.topology) + + # Add position restraints to all heavy atoms + restraint_force = openmm.CustomExternalForce( + "0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)" + ) + stiffness_openmm = (stiffness * ENERGY / (LENGTH**2)).value_in_unit( + unit.kilojoules_per_mole / unit.nanometers**2 + ) + restraint_force.addGlobalParameter("k", stiffness_openmm) + for p in ["x0", "y0", "z0"]: + restraint_force.addPerParticleParameter(p) + + for i, atom in enumerate(modeller.topology.atoms()): + if atom.element.name != "hydrogen": + pos = modeller.positions[i].value_in_unit(unit.nanometers) + restraint_force.addParticle(i, pos) + + system.addForce(restraint_force) + logger.debug( + f"Added restraints to {restraint_force.getNumParticles()} heavy atoms" + ) + + # Check for GPU + use_gpu = check_gpu_available() + + # Create simulation and minimize + integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) + platform = Platform.getPlatformByName("CUDA" if use_gpu else "CPU") + simulation = openmm_app.Simulation( + modeller.topology, system, integrator, platform + ) + simulation.context.setPositions(modeller.positions) + + # Minimize + simulation.minimizeEnergy() + + # Get minimized structure + state = simulation.context.getState(getPositions=True) + output = 
io.StringIO() + openmm_app.PDBFile.writeFile( + simulation.topology, state.getPositions(), output, keepIds=True + ) + + logger.debug("Constrained minimization with ligands complete") + return output.getvalue() + + def _idealize_chain_segment( residues: list, config: IdealizeConfig, @@ -548,6 +699,8 @@ def _idealize_chain_segment( def idealize_structure( pdb_string: str, config: IdealizeConfig, + ligand_forcefield: str = "openff-2.0.0", + ligand_smiles: dict = None, ) -> Tuple[str, List[ChainGap]]: """ Idealize backbone geometry while preserving dihedral angles. @@ -559,13 +712,17 @@ def idealize_structure( 2. Detect chain gaps 3. Split chains at gaps 4. Extract dihedrals and optionally correct cis-omega - 5. Run constrained minimization to relieve local strain - 6. Restore original chain IDs - 7. Restore ligands + 5. Add missing residues and atoms (protein only) + 6. Run constrained minimization on protein only + 7. Restore original chain IDs + 8. Reintroduce ligands and minimize protein+ligand complex + 9. 
Renumber residues sequentially Args: pdb_string: Input PDB file contents config: Idealization configuration + ligand_forcefield: Force field for ligand parameterization + ligand_smiles: Optional dict mapping resname -> SMILES Returns: Tuple of (idealized_pdb_string, list_of_chain_gaps) @@ -574,7 +731,8 @@ def idealize_structure( # Step 1: Extract ligands protein_pdb, ligand_lines = extract_ligands(pdb_string) - if ligand_lines.strip(): + has_ligands = bool(ligand_lines.strip()) + if has_ligands: logger.info("Extracted ligands for separate handling") # Step 2: Detect chain gaps (only if we want to retain them) @@ -599,23 +757,50 @@ def idealize_structure( if residues: _idealize_chain_segment(residues, config) - # Step 5: Run constrained minimization - # This is the key step - it fixes local geometry issues while - # keeping the overall structure close to the original + # Step 5-6: Add missing residues/atoms and run constrained minimization + # (protein only - ligands are not included yet) minimized_pdb = minimize_with_constraints( protein_pdb, stiffness=config.post_idealize_stiffness, add_missing_residues=config.add_missing_residues, ) - # Step 6: Restore original chain IDs + # Step 7: Restore original chain IDs if chain_mapping: minimized_pdb = restore_chain_ids(minimized_pdb, chain_mapping) - # Step 7: Restore ligands - final_pdb = restore_ligands(minimized_pdb, ligand_lines) + # Step 8: Reintroduce ligands and minimize protein+ligand complex + if has_ligands: + # Restore ligands to the minimized protein + combined_pdb = restore_ligands(minimized_pdb, ligand_lines) + + # Run a second minimization with ligands included + # This allows protein-ligand interactions to relax together + if OPENMMFF_AVAILABLE: + logger.info("Running final minimization with ligands included") + try: + final_pdb = minimize_with_ligands( + combined_pdb, + stiffness=config.post_idealize_stiffness, + ligand_forcefield=ligand_forcefield, + ligand_smiles=ligand_smiles, + ) + except Exception as 
e: + logger.warning( + f"Ligand minimization failed: {e}. " + "Ligands restored without minimization." + ) + final_pdb = combined_pdb + else: + logger.warning( + "openmmforcefields not available. " + "Ligands restored without minimization." + ) + final_pdb = combined_pdb + else: + final_pdb = minimized_pdb - # Step 8: Renumber residues sequentially per chain + # Step 9: Renumber residues sequentially per chain # This fixes issues where pdbfixer assigns non-sequential numbers final_pdb = renumber_residues_sequential(final_pdb) diff --git a/src/graphrelax/pipeline.py b/src/graphrelax/pipeline.py index f37b5da..7ce3193 100644 --- a/src/graphrelax/pipeline.py +++ b/src/graphrelax/pipeline.py @@ -205,7 +205,10 @@ def _run_single_output( if self.config.idealize.enabled: logger.info("Running pre-idealization...") current_pdb, gaps = idealize_structure( - current_pdb, self.config.idealize + current_pdb, + self.config.idealize, + ligand_forcefield=self.config.relax.ligand_forcefield, + ligand_smiles=self.config.relax.ligand_smiles, ) if gaps: logger.info( From 84d6bdc65cfabcab5c8b40121d2005fe70c9dbd3 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:30:51 -0500 Subject: [PATCH 19/22] Raise error for unparameterizable metallocofactors When ligands containing transition metals (heme, Fe-S clusters, chlorophylls, etc.) are detected, raise a clear error explaining the options instead of attempting to parameterize them. 
- Add UNPARAMETERIZABLE_COFACTORS set in ligand_utils.py - Add is_unparameterizable_cofactor() check function - Update relaxer and idealize to fail early with helpful message Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 22 ++++++++++++++-- src/graphrelax/ligand_utils.py | 47 ++++++++++++++++++++++++++++++++++ src/graphrelax/relaxer.py | 16 ++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 80ac97e..dd7e768 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -31,6 +31,7 @@ from graphrelax.ligand_utils import ( create_openff_molecule, extract_ligands_from_pdb, + is_unparameterizable_cofactor, ligand_pdb_to_topology, ) from graphrelax.utils import check_gpu_available @@ -560,8 +561,24 @@ def minimize_with_ligands( # Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + + # Check for unparameterizable cofactors and fail early + for lig in ligands: + if is_unparameterizable_cofactor(lig.resname): + raise ValueError( + f"Cannot parameterize cofactor '{lig.resname}' " + f"(chain {lig.chain_id}, res {lig.resnum}). " + f"Metallocofactors like heme, Fe-S clusters, and chlorophylls " + f"cannot be parameterized by standard force fields.\n\n" + f"Options:\n" + f" 1. Remove the cofactor from the input PDB\n" + f" 2. Use --ignore-ligands to exclude all ligands\n" + f" 3. 
Use --no-idealize to skip idealization" + ) + ligand_names = [lig.resname for lig in ligands] - logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") + if ligands: + logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") # Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) @@ -651,9 +668,10 @@ def minimize_with_ligands( openmm_app.PDBFile.writeFile( simulation.topology, state.getPositions(), output, keepIds=True ) + result = output.getvalue() logger.debug("Constrained minimization with ligands complete") - return output.getvalue() + return result def _idealize_chain_segment( diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index c774e07..0849d01 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -141,6 +141,53 @@ def get_ion_smiles() -> Dict[str, str]: } +# Cofactors that contain metals or unusual chemistry that cannot be +# parameterized by standard force fields (GAFF, OpenFF, etc.) +# These ligands will be excluded from minimization and restored unchanged. 
+UNPARAMETERIZABLE_COFACTORS = { + # Heme and porphyrins (contain Fe) + "HEM", + "HEC", # Heme C + "HEA", # Heme A + "HEB", # Heme B + "1HE", # Heme variant + "2HE", + "DHE", # Deuteroheme + "HAS", # Heme-AS + "HDD", # Hydroxyheme + "HEO", # Heme O + "HNI", # Heme N + "SRM", # Siroheme + # Iron-sulfur clusters + "SF4", # 4Fe-4S cluster + "FES", # 2Fe-2S cluster + "F3S", # 3Fe-4S cluster + # Other metallocofactors + "CLA", # Chlorophyll A + "CLB", # Chlorophyll B + "BCL", # Bacteriochlorophyll + "BPH", # Bacteriopheophytin + "PHO", # Pheophytin + "CHL", # Chlorophyll + "B12", # Vitamin B12 / Cobalamin + "COB", # Cobalamin + "PQQ", # Pyrroloquinoline quinone + "MTE", # Methanopterin + "F43", # Coenzyme F430 (Ni) + "MO7", # Molybdopterin + "MGD", # Molybdopterin guanine dinucleotide + # Copper centers + "CU1", # Copper site + "CUA", # CuA center + "CUB", # CuB center +} + + +def is_unparameterizable_cofactor(resname: str) -> bool: + """Check if a residue is a known unparameterizable cofactor.""" + return resname.upper() in UNPARAMETERIZABLE_COFACTORS + + def is_single_atom_ligand(ligand: LigandInfo) -> bool: """Check if ligand is a single atom (ion).""" return len(ligand.pdb_lines) == 1 diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 898df2c..4067120 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -23,6 +23,7 @@ from graphrelax.ligand_utils import ( create_openff_molecule, extract_ligands_from_pdb, + is_unparameterizable_cofactor, ligand_pdb_to_topology, ) from graphrelax.utils import check_gpu_available @@ -369,6 +370,21 @@ def _relax_unconstrained_with_ligands( # Step 1: Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + + # Check for unparameterizable cofactors and fail early + for lig in ligands: + if is_unparameterizable_cofactor(lig.resname): + raise ValueError( + f"Cannot parameterize cofactor '{lig.resname}' " + f"(chain {lig.chain_id}, res {lig.resnum}). 
" + f"Metallocofactors like heme, Fe-S clusters, and " + f"chlorophylls cannot be parameterized.\n\n" + f"Options:\n" + f" 1. Remove the cofactor from the input PDB\n" + f" 2. Use --ignore-ligands to exclude all ligands\n" + f" 3. Use --constrained-minimization (protein-only)" + ) + ligand_names = [lig.resname for lig in ligands] logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") From bc5394765dc74bffedadb339e398d8d57e3e3a1b Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:36:31 -0500 Subject: [PATCH 20/22] Remove silent error handling for ligand parameterization Don't silently catch and continue when ligand parameterization fails during idealization - let it fail with a clear error message instead of failing later during relaxation. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index dd7e768..0aab84a 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -793,28 +793,20 @@ def idealize_structure( combined_pdb = restore_ligands(minimized_pdb, ligand_lines) # Run a second minimization with ligands included - # This allows protein-ligand interactions to relax together if OPENMMFF_AVAILABLE: logger.info("Running final minimization with ligands included") - try: - final_pdb = minimize_with_ligands( - combined_pdb, - stiffness=config.post_idealize_stiffness, - ligand_forcefield=ligand_forcefield, - ligand_smiles=ligand_smiles, - ) - except Exception as e: - logger.warning( - f"Ligand minimization failed: {e}. " - "Ligands restored without minimization." - ) - final_pdb = combined_pdb + final_pdb = minimize_with_ligands( + combined_pdb, + stiffness=config.post_idealize_stiffness, + ligand_forcefield=ligand_forcefield, + ligand_smiles=ligand_smiles, + ) else: - logger.warning( - "openmmforcefields not available. " - "Ligands restored without minimization." 
+ raise ImportError( + "Ligand minimization requires openmmforcefields.\n" + "Install with: conda install -c conda-forge openmmforcefields\n" + "Or use --ignore-ligands to exclude ligands." ) - final_pdb = combined_pdb else: final_pdb = minimized_pdb From f586cc73b4028826e62a07d0acb71f9667bc77a3 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:39:54 -0500 Subject: [PATCH 21/22] Don't silently skip ligands that fail parameterization If create_openff_molecule() fails, let the error propagate rather than silently skipping the ligand and then failing later. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 0aab84a..0da9cf0 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -592,15 +592,8 @@ def minimize_with_ligands( openff_molecules = [] for ligand in ligands: smiles = user_smiles.get(ligand.resname) - try: - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - except Exception as e: - logger.warning( - f"Could not parameterize ligand {ligand.resname}: {e}. " - "Ligand will be excluded from minimization." 
- ) - continue + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) # Create combined topology using Modeller modeller = openmm_app.Modeller(fixer.topology, fixer.positions) From 4d8ec60c7d516d5b436a6ef8b11ab2594c5199bb Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 15:32:00 -0500 Subject: [PATCH 22/22] Simplify ligand handling: use exclusion zones instead of parameterization - Remove openmmforcefields dependency for ligand handling - Add ligand exclusion zone approach in relaxer.py: ligand atoms are added as massless dummy particles with LJ repulsion, preventing protein from moving into ligand space during minimization - Simplify idealize.py: ligands are extracted before protein minimization and restored afterward at original positions - Fix PyTorch deprecation warning in tensor_utils.py (use tuple for multidimensional indexing) - Add PDBe SMILES fetching for ligand identification (ligand_utils.py) - Remove complex ligand parameterization code that was failing due to PDB files lacking bond information for HETATM records This approach is more robust because: 1. No need for ligand force field parameters 2. Works with any ligand without SMILES 3. 
Protein minimizes fully while respecting ligand positions Co-Authored-By: Claude Opus 4.5 --- .../LigandMPNN/openfold/utils/tensor_utils.py | 2 +- src/graphrelax/cli.py | 10 + src/graphrelax/config.py | 1 + src/graphrelax/idealize.py | 197 +--------- src/graphrelax/ligand_utils.py | 145 ++++++- src/graphrelax/pipeline.py | 2 - src/graphrelax/relaxer.py | 365 ++++++++---------- tests/test_ligand_utils.py | 45 +++ 8 files changed, 357 insertions(+), 410 deletions(-) diff --git a/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py b/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py index ccc7e7a..bb6fc97 100644 --- a/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py +++ b/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py @@ -91,7 +91,7 @@ def batched_gather(data, inds, dim=0, no_batch_dims=0): ] remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds ranges.extend(remaining_dims) - return data[ranges] + return data[tuple(ranges)] # With tree_map, a poor man's JAX tree_map diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index fe9ac0e..010b7c2 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -282,6 +282,15 @@ def create_parser() -> argparse.ArgumentParser: "Can be used multiple times. Example: --ligand-smiles LIG:SMILES" ), ) + relax_group.add_argument( + "--no-fetch-ligand-smiles", + action="store_true", + help=( + "Disable automatic SMILES lookup from PDBe Chemical Component " + "Dictionary. By default, if RDKit cannot infer bond topology from " + "coordinates, SMILES are fetched from PDBe." 
+ ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -499,6 +508,7 @@ def main(args=None) -> int: ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, + fetch_pdbe_smiles=not opts.no_fetch_ligand_smiles, ) # Build keep_residues set from --keep-ligand flag (comma-separated) diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 6597ca2..2bb3ff3 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -55,6 +55,7 @@ class RelaxConfig: ligand_smiles: Dict[str, str] = field( default_factory=dict ) # {resname: SMILES} + fetch_pdbe_smiles: bool = True # Auto-fetch SMILES from PDBe CCD @dataclass diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 0da9cf0..af7b8c0 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -28,23 +28,8 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig -from graphrelax.ligand_utils import ( - create_openff_molecule, - extract_ligands_from_pdb, - is_unparameterizable_cofactor, - ligand_pdb_to_topology, -) from graphrelax.utils import check_gpu_available -# openmmforcefields is only available via conda-forge -try: - from openmmforcefields.generators import SystemGenerator - - OPENMMFF_AVAILABLE = True -except ImportError: - OPENMMFF_AVAILABLE = False - SystemGenerator = None - # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" if str(LIGANDMPNN_PATH) not in sys.path: @@ -520,153 +505,6 @@ def minimize_with_constraints( return output.getvalue() -def minimize_with_ligands( - pdb_string: str, - stiffness: float = 10.0, - ligand_forcefield: str = "openff-2.0.0", - ligand_smiles: dict = None, -) -> str: - """ - Run constrained minimization with ligand support via openmmforcefields. - - This function is used as a final step in idealization when ligands are - present. 
It minimizes the protein-ligand complex with position restraints - on all heavy atoms. - - Args: - pdb_string: PDB file contents (protein + ligands) - stiffness: Restraint force constant in kcal/mol/A^2 - ligand_forcefield: Force field for ligands (openff-2.0.0, gaff-2.11) - ligand_smiles: Optional dict mapping resname -> SMILES - - Returns: - Minimized PDB string - - Raises: - ImportError: If openmmforcefields is not available. - """ - if not OPENMMFF_AVAILABLE: - raise ImportError( - "Ligand minimization requires openmmforcefields.\n" - "Install with: conda install -c conda-forge openmmforcefields" - ) - - ENERGY = unit.kilocalories_per_mole - LENGTH = unit.angstroms - - logger.debug( - f"Running constrained minimization with ligands " - f"(stiffness={stiffness}, forcefield={ligand_forcefield})" - ) - - # Separate protein and ligands - protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - - # Check for unparameterizable cofactors and fail early - for lig in ligands: - if is_unparameterizable_cofactor(lig.resname): - raise ValueError( - f"Cannot parameterize cofactor '{lig.resname}' " - f"(chain {lig.chain_id}, res {lig.resnum}). " - f"Metallocofactors like heme, Fe-S clusters, and chlorophylls " - f"cannot be parameterized by standard force fields.\n\n" - f"Options:\n" - f" 1. Remove the cofactor from the input PDB\n" - f" 2. Use --ignore-ligands to exclude all ligands\n" - f" 3. 
Use --no-idealize to skip idealization" - ) - - ligand_names = [lig.resname for lig in ligands] - if ligands: - logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") - - # Fix protein with pdbfixer (without ligands) - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) - fixer.findMissingResidues() - fixer.missingResidues = {} # Don't add missing residues at this stage - fixer.findMissingAtoms() - fixer.addMissingAtoms() - - # Create OpenFF molecules for ligands - user_smiles = ligand_smiles or {} - openff_molecules = [] - for ligand in ligands: - smiles = user_smiles.get(ligand.resname) - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - - # Create combined topology using Modeller - modeller = openmm_app.Modeller(fixer.topology, fixer.positions) - - # Add ligands back to modeller - for ligand in ligands: - ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) - modeller.add(ligand_topology, ligand_positions) - - # Create SystemGenerator with ligand support - system_generator = SystemGenerator( - forcefields=["amber/ff14SB.xml"], - small_molecule_forcefield=ligand_forcefield, - molecules=openff_molecules, - forcefield_kwargs={ - "constraints": openmm_app.HBonds, - "removeCMMotion": True, - }, - ) - - # Add hydrogens - modeller.addHydrogens(system_generator.forcefield) - - # Create system - system = system_generator.create_system(modeller.topology) - - # Add position restraints to all heavy atoms - restraint_force = openmm.CustomExternalForce( - "0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)" - ) - stiffness_openmm = (stiffness * ENERGY / (LENGTH**2)).value_in_unit( - unit.kilojoules_per_mole / unit.nanometers**2 - ) - restraint_force.addGlobalParameter("k", stiffness_openmm) - for p in ["x0", "y0", "z0"]: - restraint_force.addPerParticleParameter(p) - - for i, atom in enumerate(modeller.topology.atoms()): - if atom.element.name != "hydrogen": - pos = modeller.positions[i].value_in_unit(unit.nanometers) - 
restraint_force.addParticle(i, pos) - - system.addForce(restraint_force) - logger.debug( - f"Added restraints to {restraint_force.getNumParticles()} heavy atoms" - ) - - # Check for GPU - use_gpu = check_gpu_available() - - # Create simulation and minimize - integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) - platform = Platform.getPlatformByName("CUDA" if use_gpu else "CPU") - simulation = openmm_app.Simulation( - modeller.topology, system, integrator, platform - ) - simulation.context.setPositions(modeller.positions) - - # Minimize - simulation.minimizeEnergy() - - # Get minimized structure - state = simulation.context.getState(getPositions=True) - output = io.StringIO() - openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True - ) - result = output.getvalue() - - logger.debug("Constrained minimization with ligands complete") - return result - - def _idealize_chain_segment( residues: list, config: IdealizeConfig, @@ -710,8 +548,6 @@ def _idealize_chain_segment( def idealize_structure( pdb_string: str, config: IdealizeConfig, - ligand_forcefield: str = "openff-2.0.0", - ligand_smiles: dict = None, ) -> Tuple[str, List[ChainGap]]: """ Idealize backbone geometry while preserving dihedral angles. @@ -726,14 +562,16 @@ def idealize_structure( 5. Add missing residues and atoms (protein only) 6. Run constrained minimization on protein only 7. Restore original chain IDs - 8. Reintroduce ligands and minimize protein+ligand complex + 8. Reintroduce ligands (ligands are kept at original positions) 9. Renumber residues sequentially + Note: Ligands are NOT minimized during idealization. They are extracted + before protein minimization and restored afterward at their original + positions. This avoids the need for ligand force field parameterization. 
+ Args: pdb_string: Input PDB file contents config: Idealization configuration - ligand_forcefield: Force field for ligand parameterization - ligand_smiles: Optional dict mapping resname -> SMILES Returns: Tuple of (idealized_pdb_string, list_of_chain_gaps) @@ -780,26 +618,15 @@ def idealize_structure( if chain_mapping: minimized_pdb = restore_chain_ids(minimized_pdb, chain_mapping) - # Step 8: Reintroduce ligands and minimize protein+ligand complex + # Step 8: Reintroduce ligands if has_ligands: # Restore ligands to the minimized protein - combined_pdb = restore_ligands(minimized_pdb, ligand_lines) - - # Run a second minimization with ligands included - if OPENMMFF_AVAILABLE: - logger.info("Running final minimization with ligands included") - final_pdb = minimize_with_ligands( - combined_pdb, - stiffness=config.post_idealize_stiffness, - ligand_forcefield=ligand_forcefield, - ligand_smiles=ligand_smiles, - ) - else: - raise ImportError( - "Ligand minimization requires openmmforcefields.\n" - "Install with: conda install -c conda-forge openmmforcefields\n" - "Or use --ignore-ligands to exclude ligands." 
- ) + # Ligands are kept at their original positions (not minimized) + # This avoids the need for ligand force field parameterization + logger.info( + "Restoring ligands to minimized protein (ligands held fixed)" + ) + final_pdb = restore_ligands(minimized_pdb, ligand_lines) else: final_pdb = minimized_pdb diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 0849d01..a369fdf 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -1,7 +1,10 @@ """Utilities for ligand parameterization and handling.""" import io +import json import logging +import urllib.error +import urllib.request from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -188,32 +191,102 @@ def is_unparameterizable_cofactor(resname: str) -> bool: return resname.upper() in UNPARAMETERIZABLE_COFACTORS +# In-memory cache for PDBe SMILES lookups +_PDBE_SMILES_CACHE = {} + + +def fetch_pdbe_smiles(resname: str) -> Optional[str]: + """ + Fetch SMILES from PDBe Chemical Component Dictionary. 
# In-memory cache for PDBe SMILES lookups.  Successful lookups store the
# SMILES string; failed lookups store None so that a component known to be
# absent from the CCD (or unreachable) is not re-queried on every call.
_PDBE_SMILES_CACHE = {}

# Sentinel distinguishing "never looked up" from a cached failure (None).
_CACHE_MISS = object()


def fetch_pdbe_smiles(resname: str) -> Optional[str]:
    """
    Fetch SMILES from PDBe Chemical Component Dictionary.

    Results are memoized in ``_PDBE_SMILES_CACHE`` for the lifetime of the
    process.  Failures are cached as ``None``, so each component code
    triggers at most one network request.

    Args:
        resname: Three-letter ligand code (e.g., "ATP", "3JD")

    Returns:
        SMILES string if found, None otherwise
    """
    resname_upper = resname.upper()

    # Check cache first.  Bug fix: the previous check only returned on a
    # non-None hit, so cached *failures* fell through and re-contacted PDBe
    # on every call.  A miss-sentinel lets None hits short-circuit too.
    cached = _PDBE_SMILES_CACHE.get(resname_upper, _CACHE_MISS)
    if cached is not _CACHE_MISS:
        return cached

    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{resname_upper}"

    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            data = json.loads(response.read().decode("utf-8"))

        # Response shape: {"<HET code>": [{"smiles": [{"name": ...}, ...]}]}
        if resname_upper in data and data[resname_upper]:
            compound_data = data[resname_upper][0]
            if "smiles" in compound_data and compound_data["smiles"]:
                smiles = compound_data["smiles"][0]["name"]
                _PDBE_SMILES_CACHE[resname_upper] = smiles
                logger.info(f"Fetched SMILES for {resname_upper} from PDBe")
                return smiles

    except urllib.error.HTTPError as e:
        if e.code == 404:
            # 404 simply means the code is not in the CCD -- not an error.
            logger.debug(f"Ligand {resname_upper} not found in PDBe CCD")
        else:
            logger.warning(
                f"HTTP error fetching SMILES for {resname_upper}: {e}"
            )
    except urllib.error.URLError as e:
        logger.warning(
            f"Network error fetching SMILES for {resname_upper}: {e}"
        )
    except json.JSONDecodeError as e:
        logger.warning(f"JSON parse error for {resname_upper}: {e}")
    except Exception as e:
        logger.warning(
            f"Unexpected error fetching SMILES for {resname_upper}: {e}"
        )

    # Cache the failure too, so the lookup is not repeated next time.
    _PDBE_SMILES_CACHE[resname_upper] = None
    return None


def is_single_atom_ligand(ligand: "LigandInfo") -> bool:
    """Check if ligand is a single atom (ion), i.e. has one PDB line."""
    return len(ligand.pdb_lines) == 1
From user-provided SMILES (if given) 2. From ion lookup table (for single-atom ligands) - 3. From PDB coordinates via RDKit bond perception - 4. From direct OpenFF PDB parsing + 3. From PDBe Chemical Component Dictionary (if fetch_pdbe=True) + 4. From PDB coordinates via RDKit bond perception (fallback) + + Note: PDBe lookup is preferred over RDKit because RDKit's bond perception + from 3D coordinates often fails or produces incorrect molecules for complex + organic ligands. PDBe has correct SMILES for all standard PDB ligands. Requires openff-toolkit and rdkit (install via conda-forge). Args: ligand: LigandInfo with PDB coordinates smiles: Optional SMILES string (overrides automatic detection) + fetch_pdbe: If True, try PDBe CCD before RDKit (recommended) Returns: openff.toolkit.Molecule Raises: ImportError: If openff-toolkit or rdkit are not installed. + ValueError: If molecule cannot be created by any method. """ _check_ligand_deps() @@ -221,6 +294,7 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): if smiles: try: mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} from SMILES") return mol except Exception as e: @@ -233,6 +307,7 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): mol = Molecule.from_smiles( ion_smiles[ligand.resname], allow_undefined_stereo=True ) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created ion molecule for {ligand.resname}") return mol else: @@ -241,27 +316,59 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): f"ligand_smiles={{'{ligand.resname}': '[Element+charge]'}}" ) - # 3. Try RDKit bond perception from PDB coordinates + # 3. 
Try PDBe Chemical Component Dictionary first (most reliable) + if fetch_pdbe: + pdbe_smiles = fetch_pdbe_smiles(ligand.resname) + if pdbe_smiles: + try: + mol = Molecule.from_smiles( + pdbe_smiles, allow_undefined_stereo=True + ) + mol.name = ( + ligand.resname + ) # Required for openmmforcefields matching + logger.info( + f"Created molecule for {ligand.resname} using PDBe SMILES" + ) + return mol + except Exception as e: + logger.warning(f"Failed to create from PDBe SMILES: {e}") + + # 4. Fallback: Try RDKit bond perception from PDB coordinates + # Note: This often produces incorrect molecules for complex organic ligands pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + rdkit_error = None try: mol = _create_molecule_via_rdkit(pdb_block) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} via RDKit") return mol except Exception as e: + rdkit_error = e logger.debug(f"RDKit parsing failed: {e}") - # 4. Fallback: direct OpenFF PDB parsing + # 5. 
Last resort: Try direct OpenFF PDB parsing + openff_error = None try: mol = Molecule.from_pdb_file( io.StringIO(pdb_block), allow_undefined_stereo=True, ) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} from PDB") return mol except Exception as e: - raise ValueError( - f"Could not create molecule for ligand {ligand.resname}: {e}" - ) + openff_error = e + logger.debug(f"OpenFF PDB parsing failed: {e}") + + # All methods failed + raise ValueError( + f"Could not create molecule for ligand {ligand.resname}.\n" + f"RDKit error: {rdkit_error}\n" + f"OpenFF error: {openff_error}\n\n" + f"You can provide SMILES manually via --ligand-smiles " + f"'{ligand.resname}:YOUR_SMILES_HERE'" + ) def _create_molecule_via_rdkit(pdb_block: str): @@ -287,16 +394,34 @@ def _create_molecule_via_rdkit(pdb_block: str): return Molecule.from_rdkit(mol, allow_undefined_stereo=True) -def ligand_pdb_to_topology(ligand: LigandInfo): +def ligand_pdb_to_topology(ligand: LigandInfo, strip_hydrogens: bool = False): """ Convert ligand PDB lines to OpenMM topology and positions. Args: ligand: LigandInfo with PDB lines + strip_hydrogens: If True, remove hydrogen atoms from the topology. + Useful when hydrogens will be added later by addHydrogens(). 
def _is_hydrogen_pdb_line(line: str) -> bool:
    """Return True if a PDB ATOM/HETATM line appears to be a hydrogen.

    Prefers the element field (columns 77-78).  Falls back to the atom-name
    heuristic (name starts with "H") when the element field is absent *or*
    blank -- the previous code consulted the name only for short lines, so a
    full-width line with a blank element field kept its hydrogens.

    NOTE(review): the name heuristic can misread metals whose atom names
    start with "H" (e.g. HG for mercury) when no element field is present
    -- confirm against the ligands this pipeline actually sees.
    """
    if len(line) >= 78:
        element = line[76:78].strip()
        if element:
            return element == "H"
    atom_name = line[12:16].strip() if len(line) >= 16 else ""
    return atom_name.startswith("H")


def ligand_pdb_to_topology(ligand: "LigandInfo", strip_hydrogens: bool = False):
    """
    Convert ligand PDB lines to OpenMM topology and positions.

    Args:
        ligand: LigandInfo with PDB lines
        strip_hydrogens: If True, remove hydrogen atoms from the topology.
            Useful when hydrogens will be added later by addHydrogens().

    Returns:
        Tuple of (topology, positions)
    """
    lines = ligand.pdb_lines
    if strip_hydrogens:
        lines = [ln for ln in lines if not _is_hydrogen_pdb_line(ln)]
    # Terminate the record block so OpenMM's PDB reader stops cleanly.
    pdb_block = "\n".join(lines) + "\nEND\n"

    pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block))
    return pdb_file.topology, pdb_file.positions
ligand_pdb_to_topology, -) -from graphrelax.utils import check_gpu_available - -# openmmforcefields is only available via conda-forge -# Install with: conda install -c conda-forge openmmforcefields=0.13.0 -try: - from openmmforcefields.generators import SystemGenerator - - OPENMMFF_AVAILABLE = True -except ImportError: - OPENMMFF_AVAILABLE = False - SystemGenerator = None - - -def _check_openmmff(): - """Raise ImportError if openmmforcefields is not available.""" - if not OPENMMFF_AVAILABLE: - raise ImportError( - "Ligand support requires openmmforcefields.\n" - "This package is only available via conda-forge:\n\n" - " conda install -c conda-forge openmmforcefields=0.13.0\n\n" - "See the README for full installation instructions." - ) - # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -67,6 +37,22 @@ class Relaxer: def __init__(self, config: RelaxConfig): self.config = config + self._use_gpu: Optional[bool] = None + + def _check_gpu_available(self) -> bool: + """Check if CUDA is available for OpenMM.""" + if self._use_gpu is not None: + return self._use_gpu + + for i in range(Platform.getNumPlatforms()): + if Platform.getPlatform(i).getName() == "CUDA": + self._use_gpu = True + logger.info("OpenMM CUDA platform detected, using GPU") + return True + + self._use_gpu = False + logger.info("OpenMM CUDA not available, using CPU") + return False def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ @@ -78,12 +64,9 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: If split_chains_at_gaps is enabled, chains will be split at detected gaps before minimization to prevent artificial gap closure. - For unconstrained minimization with ligands present, ligands are - parameterized using openmmforcefields and included in the minimization. 
- - For constrained minimization, ligands are extracted before relaxation - and restored afterward (unchanged) since AmberRelaxation cannot - handle arbitrary ligands. + Ligands (non-water HETATM records) are extracted before relaxation + and restored afterward. Protein atoms near ligands are restrained + to prevent clashes when ligands are restored. Args: pdb_string: PDB file contents as string @@ -91,51 +74,15 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Check if ligands are present (non-water HETATM records) - has_ligands = any( - line.startswith("HETATM") - and line[17:20].strip() not in WATER_RESIDUES - for line in pdb_string.split("\n") - ) - - # For unconstrained minimization with ligands, use the ligand-aware path - # which includes ligands in the minimization via openmmforcefields - if not self.config.constrained and has_ligands: - if not self.config.ignore_ligands: - # Detect and handle chain gaps on protein portion only - chain_mapping = {} - if self.config.split_chains_at_gaps: - protein_pdb, _ = extract_ligands(pdb_string) - gaps = detect_chain_gaps(protein_pdb) - if gaps: - logger.info(get_gap_summary(gaps)) - # Split the full PDB (with ligands) at gaps - pdb_string, chain_mapping = split_chains_at_gaps( - pdb_string, gaps - ) - - # Run unconstrained minimization with ligands included - relaxed_pdb, debug_info, violations = self._relax_unconstrained( - pdb_string - ) - - # Restore original chain IDs if chains were split - if chain_mapping: - relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) - debug_info["chains_split"] = True - debug_info["gaps_detected"] = len( - [k for k, v in chain_mapping.items() if k != v] - ) - - return relaxed_pdb, debug_info, violations - - # For constrained minimization or when ignoring ligands: - # Extract ligands, relax protein-only, restore ligands + # Extract ligands before relaxation (AMBER can't parameterize them) 
protein_pdb, ligand_lines = extract_ligands(pdb_string) + ligand_coords = None if ligand_lines.strip(): logger.debug( "Extracted ligands for separate handling during relaxation" ) + # Parse ligand coordinates to restrain nearby protein atoms + ligand_coords = self._parse_ligand_coords(ligand_lines) # Detect and handle chain gaps if configured chain_mapping = {} @@ -152,7 +99,7 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: relaxed_pdb, debug_info, violations = self.relax_protein(prot) else: relaxed_pdb, debug_info, violations = self._relax_unconstrained( - protein_pdb + protein_pdb, ligand_coords=ligand_coords ) # Restore original chain IDs if chains were split @@ -168,6 +115,20 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: return relaxed_pdb, debug_info, violations + def _parse_ligand_coords(self, ligand_lines: str) -> np.ndarray: + """Parse ligand atom coordinates from HETATM lines.""" + coords = [] + for line in ligand_lines.split("\n"): + if line.startswith("HETATM"): + try: + x = float(line[30:38]) + y = float(line[38:46]) + z = float(line[46:54]) + coords.append([x, y, z]) + except (ValueError, IndexError): + continue + return np.array(coords) if coords else None + def relax_pdb_file(self, pdb_path: Path) -> Tuple[str, dict, np.ndarray]: """ Relax a PDB file. @@ -192,7 +153,7 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() relaxer = AmberRelaxation( max_iterations=self.config.max_iterations, @@ -219,52 +180,40 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: return relaxed_pdb, debug_data, violations def _relax_unconstrained( - self, pdb_string: str + self, pdb_string: str, ligand_coords: np.ndarray = None ) -> Tuple[str, dict, np.ndarray]: """ Bare-bones unconstrained OpenMM minimization. 
- No position restraints, no violation checking, uses OpenMM defaults. - This is the default minimization mode. - - Ligands are auto-detected. If present and ignore_ligands is False, - uses openmmforcefields for ligand parameterization. + No position restraints on protein, uses OpenMM defaults. + If ligand_coords is provided, adds fixed "dummy" particles at those + positions with LJ repulsion to prevent protein from clashing with + ligand positions. Args: pdb_string: PDB file contents as string (protein-only) + ligand_coords: Optional array of ligand atom positions (Angstroms) Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Check if ligands are present (non-water HETATM records) - has_ligands = any( - line.startswith("HETATM") - and line[17:20].strip() not in WATER_RESIDUES - for line in pdb_string.split("\n") - ) - - # Auto-detect ligands and use openmmforcefields unless ignore_ligands - if has_ligands and not self.config.ignore_ligands: - return self._relax_unconstrained_with_ligands(pdb_string) - ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() + has_ligand = ligand_coords is not None and len(ligand_coords) > 0 logger.info( f"Running unconstrained OpenMM minimization " - f"(max_iter={self.config.max_iterations}, gpu={use_gpu})" + f"(max_iter={self.config.max_iterations}, gpu={use_gpu}" + f"{', with ligand exclusion zone' if has_ligand else ''})" ) # Use pdbfixer to add missing atoms and terminal groups + from pdbfixer import PDBFixer + fixer = PDBFixer(pdbfile=io.StringIO(pdb_string)) fixer.findMissingResidues() - if not self.config.add_missing_residues: - fixer.missingResidues = {} # Clear to preserve original numbering - elif fixer.missingResidues: - n_missing = sum(len(v) for v in fixer.missingResidues.values()) - logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -282,6 +231,43 @@ def 
_relax_unconstrained( modeller.topology, constraints=openmm_app.HBonds ) + n_protein_atoms = system.getNumParticles() + + # Add ligand atoms as fixed dummy particles with LJ repulsion + ligand_particle_indices = [] + if has_ligand: + # Add a custom nonbonded force for ligand-protein repulsion + # Using soft-core LJ potential to prevent singularities + ligand_repulsion = openmm.CustomNonbondedForce( + "epsilon * (sigma/r)^12; " + "sigma=0.3; epsilon=4.0" # 3 Angstrom radius, 4 kJ/mol + ) + ligand_repulsion.setNonbondedMethod( + openmm.CustomNonbondedForce.CutoffNonPeriodic + ) + ligand_repulsion.setCutoffDistance(1.2 * unit.nanometers) + + # Add all protein atoms to the force + for _ in range(n_protein_atoms): + ligand_repulsion.addParticle([]) + + # Add ligand dummy particles to the system + for _ in ligand_coords: + # Add massless particle (won't move) + idx = system.addParticle(0.0) + ligand_particle_indices.append(idx) + ligand_repulsion.addParticle([]) + + # Set interaction groups: protein interacts with ligand dummies + protein_set = set(range(n_protein_atoms)) + ligand_set = set(ligand_particle_indices) + ligand_repulsion.addInteractionGroup(protein_set, ligand_set) + + system.addForce(ligand_repulsion) + logger.debug( + f"Added {len(ligand_coords)} ligand exclusion particles" + ) + # Create integrator and simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) platform = openmm.Platform.getPlatformByName( @@ -290,7 +276,18 @@ def _relax_unconstrained( simulation = openmm_app.Simulation( modeller.topology, system, integrator, platform ) - simulation.context.setPositions(modeller.positions) + + # Set positions: protein from modeller, ligand dummies from coords + positions = list(modeller.positions) + if has_ligand: + for coord in ligand_coords: + # Convert Angstroms to nanometers + positions.append( + openmm.Vec3(coord[0], coord[1], coord[2]) + * 0.1 + * unit.nanometers + ) + simulation.context.setPositions(positions) # Get initial energy state = 
simulation.context.getState(getEnergy=True, getPositions=True) @@ -308,13 +305,18 @@ def _relax_unconstrained( efinal = state.getPotentialEnergy().value_in_unit(ENERGY) pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - # Calculate RMSD - rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) + # Calculate RMSD (protein atoms only) + rmsd = np.sqrt( + np.sum((posinit[:n_protein_atoms] - pos[:n_protein_atoms]) ** 2) + / n_protein_atoms + ) - # Write output PDB (keepIds=True preserves original residue numbering) + # Write output PDB (protein only - exclude dummy ligand particles) output = io.StringIO() + # Get only protein positions + protein_positions = state.getPositions()[:n_protein_atoms] openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True + modeller.topology, protein_positions, output ) relaxed_pdb = output.getvalue() @@ -324,6 +326,8 @@ def _relax_unconstrained( "rmsd": rmsd, "attempts": 1, } + if has_ligand: + debug_data["ligand_exclusion_atoms"] = len(ligand_coords) logger.info( f"Minimization complete: E_init={einit:.2f}, " @@ -335,121 +339,55 @@ def _relax_unconstrained( return relaxed_pdb, debug_data, violations - def _relax_unconstrained_with_ligands( - self, pdb_string: str - ) -> Tuple[str, dict, np.ndarray]: + def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ - Unconstrained OpenMM minimization with ligand support. + Direct OpenMM minimization without pdbfixer. - Uses openmmforcefields to parameterize ligands with GAFF2/OpenFF. - The protein is processed separately with pdbfixer, then combined - with parameterized ligands for minimization. - - Requires openmmforcefields (install via conda-forge). + This is a simpler approach that works for already-complete structures + (like those from LigandMPNN with packed side chains). 
Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) - - Raises: - ImportError: If openmmforcefields is not installed. """ - _check_openmmff() - ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() logger.info( - f"Running unconstrained minimization with ligands " - f"(forcefield={self.config.ligand_forcefield}, gpu={use_gpu})" + f"Running direct OpenMM minimization " + f"(max_iter={self.config.max_iterations}, " + f"stiffness={self.config.stiffness}, gpu={use_gpu})" ) - # Step 1: Separate protein and ligands - protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - - # Check for unparameterizable cofactors and fail early - for lig in ligands: - if is_unparameterizable_cofactor(lig.resname): - raise ValueError( - f"Cannot parameterize cofactor '{lig.resname}' " - f"(chain {lig.chain_id}, res {lig.resnum}). " - f"Metallocofactors like heme, Fe-S clusters, and " - f"chlorophylls cannot be parameterized.\n\n" - f"Options:\n" - f" 1. Remove the cofactor from the input PDB\n" - f" 2. Use --ignore-ligands to exclude all ligands\n" - f" 3. 
Use --constrained-minimization (protein-only)" - ) - - ligand_names = [lig.resname for lig in ligands] - logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") + # Parse PDB + pdb_file = io.StringIO(pdb_string) + pdb = openmm_app.PDBFile(pdb_file) - # Step 2: Fix protein with pdbfixer (without ligands) - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) - fixer.findMissingResidues() - if not self.config.add_missing_residues: - fixer.missingResidues = {} # Clear to preserve original numbering - elif fixer.missingResidues: - n_missing = sum(len(v) for v in fixer.missingResidues.values()) - logger.info(f"Adding {n_missing} missing residues from SEQRES") - fixer.findMissingAtoms() - fixer.addMissingAtoms() - - # Step 3: Create OpenFF molecules for ligands - # User-provided SMILES override automatic detection - user_smiles = self.config.ligand_smiles or {} - - openff_molecules = [] - for ligand in ligands: - smiles = user_smiles.get(ligand.resname) - try: - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - logger.debug(f"Created OpenFF molecule for {ligand.resname}") - except Exception as e: - resname = ligand.resname - raise ValueError( - f"Could not parameterize ligand '{resname}' " - f"(chain {ligand.chain_id}, res {ligand.resnum}): {e}\n\n" - f"Options to resolve:\n" - f" 1. Provide SMILES via config: " - f"ligand_smiles={{'{resname}': 'SMILES_STRING'}}\n" - f" 2. Use constrained minimization: constrained=True\n" - f" 3. 
Remove the ligand from the input PDB\n\n" - f"For common ligands, see: " - f"https://www.ebi.ac.uk/pdbe-srv/pdbechem/" - ) + # Create force field and system + force_field = openmm_app.ForceField( + "amber14-all.xml", "amber14/tip3pfb.xml" + ) - # Step 4: Create combined topology using Modeller - modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + # Use Modeller to add hydrogens (doesn't require pdbfixer) + modeller = openmm_app.Modeller(pdb.topology, pdb.positions) + modeller.addHydrogens(force_field) - # Add ligands back to modeller - for ligand in ligands: - ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) - modeller.add(ligand_topology, ligand_positions) - - # Step 5: Create SystemGenerator with ligand support - system_generator = SystemGenerator( - forcefields=["amber/ff14SB.xml"], - small_molecule_forcefield=self.config.ligand_forcefield, - molecules=openff_molecules, - forcefield_kwargs={ - "constraints": openmm_app.HBonds, - "removeCMMotion": True, - }, + # Create system with constraints on hydrogen bonds + system = force_field.createSystem( + modeller.topology, constraints=openmm_app.HBonds ) - # Add hydrogens - modeller.addHydrogens(system_generator.forcefield) - - # Create system - system = system_generator.create_system(modeller.topology) + # Add position restraints if stiffness > 0 + if self.config.stiffness > 0: + self._add_restraints( + system, modeller, self.config.stiffness * ENERGY / (LENGTH**2) + ) - # Step 6: Run minimization + # Create integrator and simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" @@ -465,10 +403,13 @@ def _relax_unconstrained_with_ligands( posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) # Minimize - if self.config.max_iterations > 0: - simulation.minimizeEnergy(maxIterations=self.config.max_iterations) - else: - simulation.minimizeEnergy() + # OpenMM minimizeEnergy tolerance is in kJ/mol/nm 
(gradient threshold) + tolerance = ( + self.config.tolerance * unit.kilojoules_per_mole / unit.nanometer + ) + simulation.minimizeEnergy( + maxIterations=self.config.max_iterations, tolerance=tolerance + ) # Get final state state = simulation.context.getState(getEnergy=True, getPositions=True) @@ -478,10 +419,10 @@ def _relax_unconstrained_with_ligands( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - # Write output PDB (keepIds=True preserves original residue numbering) + # Write output PDB output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True + simulation.topology, state.getPositions(), output ) relaxed_pdb = output.getvalue() @@ -490,16 +431,16 @@ def _relax_unconstrained_with_ligands( "final_energy": efinal, "rmsd": rmsd, "attempts": 1, - "ligands_included": [lig.resname for lig in ligands], - "ligand_forcefield": self.config.ligand_forcefield, } logger.info( - f"Minimization complete: E_init={einit:.2f}, " + f"Relaxation complete: E_init={einit:.2f}, " f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" ) + # No violations tracking in direct mode violations = np.zeros(0) + return relaxed_pdb, debug_data, violations def _add_restraints(self, system, modeller, stiffness): @@ -551,7 +492,7 @@ def get_energy_breakdown(self, pdb_string: str) -> dict: ) # Create simulation - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" ) diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py index 5187941..d4c7b9d 100644 --- a/tests/test_ligand_utils.py +++ b/tests/test_ligand_utils.py @@ -1,9 +1,13 @@ """Unit tests for ligand utilities.""" +import pytest + from graphrelax.ligand_utils import ( + _PDBE_SMILES_CACHE, WATER_RESIDUES, LigandInfo, extract_ligands_from_pdb, + fetch_pdbe_smiles, get_ion_smiles, is_single_atom_ligand, ) @@ -181,3 +185,44 @@ def test_tip_models(self): assert 
"TIP3" in WATER_RESIDUES assert "TIP4" in WATER_RESIDUES assert "SPC" in WATER_RESIDUES + + +class TestFetchPdbeSmiles: + """Tests for PDBe SMILES fetching.""" + + def setup_method(self): + """Clear cache before each test.""" + _PDBE_SMILES_CACHE.clear() + + @pytest.mark.network + def test_fetch_known_ligand(self): + """Test fetching SMILES for a known PDB ligand.""" + # ATP is a well-known ligand + smiles = fetch_pdbe_smiles("ATP") + assert smiles is not None + assert len(smiles) > 0 + # Cache should be populated + assert "ATP" in _PDBE_SMILES_CACHE + + @pytest.mark.network + def test_fetch_unknown_ligand(self): + """Test fetching SMILES for a non-existent ligand.""" + smiles = fetch_pdbe_smiles("ZZZZZ") + assert smiles is None + # Failure should also be cached + assert "ZZZZZ" in _PDBE_SMILES_CACHE + assert _PDBE_SMILES_CACHE["ZZZZZ"] is None + + def test_cache_hit(self): + """Test that cached SMILES are returned.""" + # Pre-populate cache + _PDBE_SMILES_CACHE["CACHED"] = "CCC" + smiles = fetch_pdbe_smiles("CACHED") + assert smiles == "CCC" + + def test_case_insensitive(self): + """Test that lookup is case-insensitive.""" + _PDBE_SMILES_CACHE["TEST"] = "C=O" + # Lowercase should find uppercase cache entry + smiles = fetch_pdbe_smiles("test") + assert smiles == "C=O"