From 7fcac8288791084b196d8b972658f18022799f6d Mon Sep 17 00:00:00 2001 From: delalamo Date: Mon, 12 Jan 2026 09:08:33 -0500 Subject: [PATCH 01/22] Add Docker support with GitHub Container Registry publishing - Add Dockerfile using condaforge/miniforge3 for conda dependencies - Add docker-build.yml workflow triggered by releases, tags, or manual dispatch - Add .dockerignore to exclude build artifacts - Update README with Docker usage instructions Image published to ghcr.io/delalamo/graphrelax Co-Authored-By: Claude Opus 4.5 --- .dockerignore | 14 ++++++ .github/workflows/docker-build.yml | 70 ++++++++++++++++++++++++++++++ Dockerfile | 38 ++++++++++++++++ README.md | 24 ++++++++++ 4 files changed, 146 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/docker-build.yml create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d308aa8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +.git +__pycache__ +*.py[cod] +*.egg-info/ +dist/ +build/ +.pytest_cache/ +.coverage +.venv/ +venv/ +.idea/ +.vscode/ +src/graphrelax/LigandMPNN/model_params/*.pt +.DS_Store diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 0000000..f4006cb --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,70 @@ +name: Docker Build + +on: + release: + types: [published] + push: + tags: + - "v*" + workflow_dispatch: + inputs: + tag: + description: "Image tag (optional, defaults to 'dev')" + required: false + default: "dev" + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Compute image tags + id: tags + run: | + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' 
'[:lower:]') + if [[ "${{ github.event_name }}" == "release" ]]; then + VERSION="${{ github.event.release.tag_name }}" + VERSION="${VERSION#v}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${VERSION},${{ env.REGISTRY }}/${REPO_LOWER}:latest" >> $GITHUB_OUTPUT + elif [[ "${{ github.ref }}" == refs/tags/v* ]]; then + VERSION="${GITHUB_REF#refs/tags/v}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${VERSION}" >> $GITHUB_OUTPUT + else + TAG="${{ github.event.inputs.tag || 'dev' }}" + echo "tags=${{ env.REGISTRY }}/${REPO_LOWER}:${TAG}" >> $GITHUB_OUTPUT + fi + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: ${{ steps.tags.outputs.tags }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Verify image + run: | + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + TAG="${{ github.event.inputs.tag || 'dev' }}" + docker run --rm ${{ env.REGISTRY }}/${REPO_LOWER}:${TAG} --help diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5d60f2d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +# GraphRelax Docker Image +# Combines LigandMPNN sequence design with OpenMM AMBER relaxation +# +# Build: docker build -t graphrelax . 
+# Run: docker run --rm -v $(pwd):/data graphrelax -i /data/input.pdb -o /data/output.pdb + +FROM condaforge/miniforge3:latest + +LABEL org.opencontainers.image.source="https://github.com/delalamo/GraphRelax" +LABEL org.opencontainers.image.description="GraphRelax: LigandMPNN + OpenMM AMBER relaxation" +LABEL org.opencontainers.image.licenses="MIT" + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy package files +COPY pyproject.toml README.md LICENSE ./ +COPY src/ ./src/ +COPY scripts/ ./scripts/ + +# Install conda dependencies and package +# Note: openmm and pdbfixer are only available on conda-forge, not PyPI +RUN mamba install -y -c conda-forge python=3.11 openmm pdbfixer pytorch-cpu \ + && mamba clean -afy \ + && pip install --no-cache-dir -e . + +# Download model weights (~40MB total) +RUN ./scripts/download_weights.sh + +# Create non-root user for security +RUN useradd -m graphrelax && chown -R graphrelax:graphrelax /app +USER graphrelax + +ENTRYPOINT ["graphrelax"] +CMD ["--help"] diff --git a/README.md b/README.md index 455bc8a..841d777 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,30 @@ pip install -e ".[cuda11]" pip install -e ".[cuda12]" ``` +### Docker + +Run GraphRelax using Docker without installing dependencies: + +```bash +# Pull the image +docker pull ghcr.io/delalamo/graphrelax:latest + +# Run with input/output files +docker run --rm -v $(pwd):/data ghcr.io/delalamo/graphrelax:latest \ + -i /data/input.pdb -o /data/output.pdb + +# Design mode with 5 outputs +docker run --rm -v $(pwd):/data ghcr.io/delalamo/graphrelax:latest \ + -i /data/input.pdb -o /data/designed.pdb --design -n 5 +``` + +Build locally: + +```bash +docker build -t graphrelax . 
+docker run --rm graphrelax --help +``` + ### Dependencies Core dependencies (installed automatically via pip): From 73616967cfbff0c1604e16e1c4f3e482b68a9f56 Mon Sep 17 00:00:00 2001 From: delalamo Date: Mon, 12 Jan 2026 09:10:40 -0500 Subject: [PATCH 02/22] Adding .DS_Store to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d93d2a4..cc697a2 100644 --- a/.gitignore +++ b/.gitignore @@ -212,3 +212,6 @@ src/graphrelax/LigandMPNN/model_params/*.pt # Auto-generated version file (setuptools-scm) src/graphrelax/_version.py + +# Misc +.DS_Store From 455402d282e4a57e3ae70da4232951d2542da870 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 14:54:53 +0000 Subject: [PATCH 03/22] Add chain gap detection to prevent artificial gap closure during minimization Detects missing residues in protein chains by checking residue numbering discontinuities and C-N bond distances. Chains are split at gaps before OpenMM minimization to prevent the creation of unrealistic peptide bonds across gaps. Original chain IDs are restored after minimization. 
- Add chain_gaps.py module with detect_chain_gaps, split_chains_at_gaps, and restore_chain_ids functions - Integrate gap detection into relaxer.py relax() method - Add split_chains_at_gaps config option (enabled by default) - Add --no-split-gaps CLI flag to disable the feature - Add comprehensive tests for chain gap detection --- src/graphrelax/chain_gaps.py | 351 +++++++++++++++++++++++++++++++++++ src/graphrelax/cli.py | 10 + src/graphrelax/config.py | 1 + src/graphrelax/relaxer.py | 35 +++- tests/test_chain_gaps.py | 344 ++++++++++++++++++++++++++++++++++ 5 files changed, 739 insertions(+), 2 deletions(-) create mode 100644 src/graphrelax/chain_gaps.py create mode 100644 tests/test_chain_gaps.py diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py new file mode 100644 index 0000000..49b15ae --- /dev/null +++ b/src/graphrelax/chain_gaps.py @@ -0,0 +1,351 @@ +"""Detection and handling of chain gaps in protein structures. + +This module provides functions to detect missing residues (gaps) in protein +chains and to split chains at those gaps prior to minimization. This prevents +OpenMM minimization from artificially closing gaps by creating unrealistic +peptide bonds. 
+""" + +import io +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +from Bio.PDB import PDBParser + +logger = logging.getLogger(__name__) + +# Maximum C-N peptide bond distance in Angstroms +# Typical C-N bond is ~1.33 A, anything > 2.0 A indicates a break +MAX_PEPTIDE_BOND_DISTANCE = 2.0 + +# Maximum residue number gap that's considered sequential +# Gap > 1 indicates missing residues +MAX_SEQUENTIAL_GAP = 1 + + +@dataclass +class ChainGap: + """Represents a gap in a protein chain.""" + + chain_id: str + residue_before: int # Residue number before the gap + residue_after: int # Residue number after the gap + distance: Optional[float] = None # C-N distance if available + icode_before: str = "" + icode_after: str = "" + + def __str__(self) -> str: + gap_size = self.residue_after - self.residue_before - 1 + dist_str = f", dist={self.distance:.2f}A" if self.distance else "" + return ( + f"Chain {self.chain_id}: gap of {gap_size} residues " + f"between {self.residue_before}{self.icode_before} and " + f"{self.residue_after}{self.icode_after}{dist_str}" + ) + + +def detect_chain_gaps( + pdb_string: str, + check_distance: bool = True, + max_bond_distance: float = MAX_PEPTIDE_BOND_DISTANCE, +) -> List[ChainGap]: + """ + Detect gaps in protein chains. + + Gaps are detected by two methods: + 1. Residue numbering discontinuities (missing residue numbers) + 2. 
Large C-N distances between consecutive residues (if check_distance=True) + + Args: + pdb_string: PDB file contents as string + check_distance: Whether to also check C-N bond distances + max_bond_distance: Maximum allowed C-N distance in Angstroms + + Returns: + List of ChainGap objects describing each detected gap + """ + parser = PDBParser(QUIET=True) + structure = parser.get_structure("protein", io.StringIO(pdb_string)) + + gaps = [] + + for model in structure: + for chain in model: + residues = [r for r in chain.get_residues() if r.id[0] == " "] + if len(residues) < 2: + continue + + for i in range(len(residues) - 1): + res1 = residues[i] + res2 = residues[i + 1] + + resnum1 = res1.id[1] + resnum2 = res2.id[1] + icode1 = res1.id[2].strip() + icode2 = res2.id[2].strip() + + # Check for numbering gap + numbering_gap = resnum2 - resnum1 > MAX_SEQUENTIAL_GAP + + # Check C-N distance if requested + distance = None + distance_gap = False + if check_distance: + try: + c_atom = res1["C"] + n_atom = res2["N"] + distance = c_atom - n_atom + distance_gap = distance > max_bond_distance + except KeyError: + # Missing backbone atoms - treat as potential gap + distance_gap = True + + if numbering_gap or distance_gap: + gap = ChainGap( + chain_id=chain.id, + residue_before=resnum1, + residue_after=resnum2, + distance=distance, + icode_before=icode1, + icode_after=icode2, + ) + gaps.append(gap) + logger.debug(f"Detected gap: {gap}") + + if gaps: + logger.info(f"Detected {len(gaps)} chain gap(s) in structure") + else: + logger.debug("No chain gaps detected") + + return gaps + + +def split_chains_at_gaps( + pdb_string: str, + gaps: Optional[List[ChainGap]] = None, +) -> Tuple[str, Dict[str, str]]: + """ + Split chains at detected gaps by assigning new chain IDs. + + Each continuous segment gets a unique chain ID. This prevents OpenMM + from creating peptide bonds across gaps during minimization. 
+ + Args: + pdb_string: PDB file contents as string + gaps: Pre-detected gaps (if None, will detect automatically) + + Returns: + Tuple of: + - Modified PDB string with split chains + - Mapping from new chain IDs to original chain IDs + """ + if gaps is None: + gaps = detect_chain_gaps(pdb_string) + + if not gaps: + return pdb_string, {} + + # Build a set of (chain_id, residue_after) for gap locations + # We'll start a new chain at each gap + gap_starts = {(g.chain_id, g.residue_after, g.icode_after) for g in gaps} + + # Available chain IDs (A-Z, a-z, 0-9) + all_chain_ids = ( + list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + + list("abcdefghijklmnopqrstuvwxyz") + + list("0123456789") + ) + + # Parse to find existing chain IDs + existing_chains = set() + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 21: + existing_chains.add(line[21]) + + # Create pool of available new chain IDs + available_ids = [c for c in all_chain_ids if c not in existing_chains] + + # Track chain ID assignments + chain_mapping = {} # new_chain_id -> original_chain_id + current_chain_map = {} # original_chain_id -> current_new_chain_id + next_id_idx = 0 + + # Process PDB line by line + output_lines = [] + current_residue = {} # chain_id -> (resnum, icode) + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 26: + chain_id = line[21] + resnum = int(line[22:26].strip()) + icode = line[26].strip() if len(line) > 26 else "" + + # Check if this residue starts a new segment (at a gap) + if (chain_id, resnum, icode) in gap_starts: + # Assign a new chain ID for this segment + if next_id_idx < len(available_ids): + new_chain_id = available_ids[next_id_idx] + next_id_idx += 1 + current_chain_map[chain_id] = new_chain_id + chain_mapping[new_chain_id] = chain_id + logger.debug( + f"Assigned new chain {new_chain_id} for segment " + f"starting at {chain_id}:{resnum}{icode}" + ) + else: + logger.warning( + "Ran out of available 
chain IDs for gap splitting" + ) + + # Initialize chain mapping if not seen before + if chain_id not in current_chain_map: + current_chain_map[chain_id] = chain_id + # Original chains map to themselves + if chain_id not in chain_mapping: + chain_mapping[chain_id] = chain_id + + # Replace chain ID in line + new_chain_id = current_chain_map[chain_id] + line = line[:21] + new_chain_id + line[22:] + + current_residue[chain_id] = (resnum, icode) + + elif line.startswith("TER") and len(line) > 21: + # Update TER record chain ID too + chain_id = line[21] + if chain_id in current_chain_map: + new_chain_id = current_chain_map[chain_id] + line = line[:21] + new_chain_id + line[22:] + + output_lines.append(line) + + result = "\n".join(output_lines) + + if chain_mapping: + # Only log non-trivial mappings + non_trivial = {k: v for k, v in chain_mapping.items() if k != v} + if non_trivial: + logger.info( + f"Split chains at {len(gaps)} gap(s), " + f"created {len(non_trivial)} new chain segment(s)" + ) + + return result, chain_mapping + + +def restore_chain_ids( + pdb_string: str, + chain_mapping: Dict[str, str], +) -> str: + """ + Restore original chain IDs after minimization. 
+ + Args: + pdb_string: PDB string with split chains + chain_mapping: Mapping from current chain IDs to original chain IDs + + Returns: + PDB string with original chain IDs restored + """ + if not chain_mapping: + return pdb_string + + # Build reverse mapping (new -> original) + # Note: chain_mapping is new_id -> original_id + output_lines = [] + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 21: + current_chain = line[21] + if current_chain in chain_mapping: + original_chain = chain_mapping[current_chain] + line = line[:21] + original_chain + line[22:] + + elif line.startswith("TER") and len(line) > 21: + current_chain = line[21] + if current_chain in chain_mapping: + original_chain = chain_mapping[current_chain] + line = line[:21] + original_chain + line[22:] + + output_lines.append(line) + + logger.debug("Restored original chain IDs") + return "\n".join(output_lines) + + +def add_ter_records_at_gaps(pdb_string: str, gaps: List[ChainGap]) -> str: + """ + Add TER records at gap locations to signal chain breaks. + + This is an alternative to chain splitting that works with force fields + that recognize TER as chain termination. 
+ + Args: + pdb_string: PDB file contents as string + gaps: List of detected gaps + + Returns: + PDB string with TER records inserted at gap locations + """ + if not gaps: + return pdb_string + + # Build set of (chain_id, resnum, icode) where TER should be inserted + # TER goes after residue_before + ter_locations = { + (g.chain_id, g.residue_before, g.icode_before) for g in gaps + } + + output_lines = [] + prev_chain = None + prev_resnum = None + prev_icode = None + + for line in pdb_string.split("\n"): + if line.startswith(("ATOM", "HETATM")) and len(line) > 26: + chain_id = line[21] + resnum = int(line[22:26].strip()) + icode = line[26].strip() if len(line) > 26 else "" + + # Check if we need to insert TER before this line + # (i.e., previous residue was at a gap location) + if ( + prev_chain is not None + and (prev_chain, prev_resnum, prev_icode) in ter_locations + and (chain_id != prev_chain or resnum != prev_resnum) + ): + # Insert TER record + ter_line = "TER " + output_lines.append(ter_line) + logger.debug( + f"Inserted TER after {prev_chain}:{prev_resnum}{prev_icode}" + ) + + prev_chain = chain_id + prev_resnum = resnum + prev_icode = icode + + output_lines.append(line) + + return "\n".join(output_lines) + + +def get_gap_summary(gaps: List[ChainGap]) -> str: + """ + Generate a human-readable summary of detected gaps. 
+ + Args: + gaps: List of detected gaps + + Returns: + Formatted string describing all gaps + """ + if not gaps: + return "No chain gaps detected" + + lines = [f"Detected {len(gaps)} chain gap(s):"] + for gap in gaps: + lines.append(f" - {gap}") + + return "\n".join(lines) diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 0d33e02..b2c0d29 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -205,6 +205,15 @@ def create_parser() -> argparse.ArgumentParser: metavar="N", help="Max L-BFGS iterations, 0=unlimited (default: 0)", ) + relax_group.add_argument( + "--no-split-gaps", + action="store_true", + help=( + "Disable automatic chain splitting at gaps. " + "By default, chains are split at detected gaps (missing residues) " + "to prevent artificial gap closure during minimization." + ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -323,6 +332,7 @@ def main(args=None) -> int: stiffness=opts.stiffness, max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, + split_chains_at_gaps=not opts.no_split_gaps, ) pipeline_config = PipelineConfig( diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index ed69714..7f55c99 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -41,6 +41,7 @@ class RelaxConfig: stiffness: float = 10.0 # kcal/mol/A^2 max_outer_iterations: int = 3 # Violation-fixing iterations constrained: bool = False # Use constrained (AmberRelaxation) minimization + split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index fefc176..de351a5 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -11,6 +11,12 @@ from openmm import app as openmm_app from openmm import openmm, unit +from graphrelax.chain_gaps import ( + detect_chain_gaps, + get_gap_summary, + restore_chain_ids, + 
split_chains_at_gaps, +) from graphrelax.config import RelaxConfig # Add vendored LigandMPNN to path for OpenFold imports @@ -54,17 +60,42 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Uses unconstrained minimization by default, or constrained AmberRelaxation if config.constrained is True. + If split_chains_at_gaps is enabled, chains will be split at detected + gaps before minimization to prevent artificial gap closure. + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ + # Detect and handle chain gaps if configured + chain_mapping = {} + if self.config.split_chains_at_gaps: + gaps = detect_chain_gaps(pdb_string) + if gaps: + logger.info(get_gap_summary(gaps)) + pdb_string, chain_mapping = split_chains_at_gaps( + pdb_string, gaps + ) + if self.config.constrained: prot = protein.from_pdb_string(pdb_string) - return self.relax_protein(prot) + relaxed_pdb, debug_info, violations = self.relax_protein(prot) else: - return self._relax_unconstrained(pdb_string) + relaxed_pdb, debug_info, violations = self._relax_unconstrained( + pdb_string + ) + + # Restore original chain IDs if chains were split + if chain_mapping: + relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) + debug_info["chains_split"] = True + debug_info["gaps_detected"] = len( + [k for k, v in chain_mapping.items() if k != v] + ) + + return relaxed_pdb, debug_info, violations def relax_pdb_file(self, pdb_path: Path) -> Tuple[str, dict, np.ndarray]: """ diff --git a/tests/test_chain_gaps.py b/tests/test_chain_gaps.py new file mode 100644 index 0000000..53b29d7 --- /dev/null +++ b/tests/test_chain_gaps.py @@ -0,0 +1,344 @@ +"""Tests for graphrelax.chain_gaps module.""" + +import pytest + +from graphrelax.chain_gaps import ( + ChainGap, + add_ter_records_at_gaps, + detect_chain_gaps, + get_gap_summary, + restore_chain_ids, + split_chains_at_gaps, +) + + +@pytest.fixture +def continuous_peptide_pdb(): + """A continuous 
peptide with no gaps (residues 1-5).""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 2 3.326 1.540 0.000 1.00 0.00 N +ATOM 6 CA ALA A 2 3.941 2.861 0.000 1.00 0.00 C +ATOM 7 C ALA A 2 5.459 2.789 0.000 1.00 0.00 C +ATOM 8 O ALA A 2 6.065 1.719 0.000 1.00 0.00 O +ATOM 9 N ALA A 3 6.063 3.970 0.000 1.00 0.00 N +ATOM 10 CA ALA A 3 7.510 4.096 0.000 1.00 0.00 C +ATOM 11 C ALA A 3 8.061 5.516 0.000 1.00 0.00 C +ATOM 12 O ALA A 3 7.298 6.486 0.000 1.00 0.00 O +ATOM 13 N ALA A 4 9.378 5.636 0.000 1.00 0.00 N +ATOM 14 CA ALA A 4 9.993 6.957 0.000 1.00 0.00 C +ATOM 15 C ALA A 4 11.511 6.885 0.000 1.00 0.00 C +ATOM 16 O ALA A 4 12.117 5.815 0.000 1.00 0.00 O +ATOM 17 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 18 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 19 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 20 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def gapped_peptide_pdb(): + """A peptide with a gap - residues 1, 2, 5, 6, 7 (missing 3-4).""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 2 3.326 1.540 0.000 1.00 0.00 N +ATOM 6 CA ALA A 2 3.941 2.861 0.000 1.00 0.00 C +ATOM 7 C ALA A 2 5.459 2.789 0.000 1.00 0.00 C +ATOM 8 O ALA A 2 6.065 1.719 0.000 1.00 0.00 O +ATOM 9 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 10 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 11 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 12 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +ATOM 13 N ALA A 6 15.115 10.066 0.000 1.00 0.00 N +ATOM 14 CA ALA A 6 16.562 10.192 0.000 1.00 0.00 C +ATOM 15 C ALA A 6 17.113 11.612 0.000 1.00 0.00 C +ATOM 16 O ALA A 6 16.350 12.582 
0.000 1.00 0.00 O +ATOM 17 N ALA A 7 18.115 12.066 0.000 1.00 0.00 N +ATOM 18 CA ALA A 7 19.562 12.192 0.000 1.00 0.00 C +ATOM 19 C ALA A 7 20.113 13.612 0.000 1.00 0.00 C +ATOM 20 O ALA A 7 19.350 14.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def multi_gap_pdb(): + """A peptide with multiple gaps - residues 1, 5, 10.""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 6 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 7 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 8 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +ATOM 9 N ALA A 10 22.115 18.066 0.000 1.00 0.00 N +ATOM 10 CA ALA A 10 23.562 18.192 0.000 1.00 0.00 C +ATOM 11 C ALA A 10 24.113 19.612 0.000 1.00 0.00 C +ATOM 12 O ALA A 10 23.350 20.582 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: on + + +@pytest.fixture +def multi_chain_gapped_pdb(): + """A multi-chain structure with a gap in chain A and continuous chain B.""" + # fmt: off + return """ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.458 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 2.009 1.420 0.000 1.00 0.00 C +ATOM 4 O ALA A 1 1.246 2.390 0.000 1.00 0.00 O +ATOM 5 N ALA A 5 12.115 8.066 0.000 1.00 0.00 N +ATOM 6 CA ALA A 5 13.562 8.192 0.000 1.00 0.00 C +ATOM 7 C ALA A 5 14.113 9.612 0.000 1.00 0.00 C +ATOM 8 O ALA A 5 13.350 10.582 0.000 1.00 0.00 O +TER +ATOM 9 N ALA B 1 30.000 0.000 0.000 1.00 0.00 N +ATOM 10 CA ALA B 1 31.458 0.000 0.000 1.00 0.00 C +ATOM 11 C ALA B 1 32.009 1.420 0.000 1.00 0.00 C +ATOM 12 O ALA B 1 31.246 2.390 0.000 1.00 0.00 O +ATOM 13 N ALA B 2 33.326 1.540 0.000 1.00 0.00 N +ATOM 14 CA ALA B 2 33.941 2.861 0.000 1.00 0.00 C +ATOM 15 C ALA B 2 35.459 2.789 0.000 1.00 0.00 C +ATOM 16 O ALA B 2 36.065 1.719 0.000 1.00 0.00 O +END +""" # noqa: E501 + # fmt: 
on + + +class TestDetectChainGaps: + """Tests for detect_chain_gaps function.""" + + def test_no_gaps_in_continuous_peptide(self, continuous_peptide_pdb): + """Test that no gaps are detected in a continuous peptide.""" + gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) + assert len(gaps) == 0 + + def test_detects_single_gap(self, gapped_peptide_pdb): + """Test detection of a single gap.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) + assert len(gaps) == 1 + assert gaps[0].chain_id == "A" + assert gaps[0].residue_before == 2 + assert gaps[0].residue_after == 5 + + def test_detects_multiple_gaps(self, multi_gap_pdb): + """Test detection of multiple gaps.""" + gaps = detect_chain_gaps(multi_gap_pdb, check_distance=False) + assert len(gaps) == 2 + + # First gap: 1 -> 5 + assert gaps[0].chain_id == "A" + assert gaps[0].residue_before == 1 + assert gaps[0].residue_after == 5 + + # Second gap: 5 -> 10 + assert gaps[1].chain_id == "A" + assert gaps[1].residue_before == 5 + assert gaps[1].residue_after == 10 + + def test_detects_gap_in_specific_chain(self, multi_chain_gapped_pdb): + """Test that gaps are correctly attributed to their chains.""" + gaps = detect_chain_gaps(multi_chain_gapped_pdb, check_distance=False) + + # Only chain A has a gap + assert len(gaps) == 1 + assert gaps[0].chain_id == "A" + + def test_distance_check_finds_large_cn_distance(self, gapped_peptide_pdb): + """Test that distance checking finds large C-N distances.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=True) + + # Should find the gap + assert len(gaps) >= 1 + # The gap should have a distance measurement + gap = gaps[0] + assert gap.distance is None or gap.distance > 2.0 + + +class TestChainGap: + """Tests for ChainGap dataclass.""" + + def test_str_representation(self): + """Test string representation of ChainGap.""" + gap = ChainGap( + chain_id="A", + residue_before=10, + residue_after=15, + distance=5.5, + ) + s = str(gap) + 
assert "A" in s + assert "4 residues" in s # 15 - 10 - 1 = 4 missing + assert "10" in s + assert "15" in s + assert "5.50" in s + + def test_str_without_distance(self): + """Test string representation without distance.""" + gap = ChainGap( + chain_id="B", + residue_before=1, + residue_after=5, + ) + s = str(gap) + assert "B" in s + assert "3 residues" in s + assert "dist=" not in s + + +class TestSplitChainsAtGaps: + """Tests for split_chains_at_gaps function.""" + + def test_no_split_without_gaps(self, continuous_peptide_pdb): + """Test that continuous peptide is not split.""" + result, mapping = split_chains_at_gaps(continuous_peptide_pdb) + assert result == continuous_peptide_pdb + assert mapping == {} + + def test_splits_at_single_gap(self, gapped_peptide_pdb): + """Test splitting at a single gap.""" + result, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # Should have new chain ID for segment after gap + assert len(mapping) > 0 + + # Check that atoms after gap have new chain ID + lines = result.split("\n") + chains_found = set() + for line in lines: + if line.startswith("ATOM") and len(line) > 21: + chains_found.add(line[21]) + + # Should have at least 2 chains (original A + new segment) + assert len(chains_found) >= 2 + + def test_splits_at_multiple_gaps(self, multi_gap_pdb): + """Test splitting at multiple gaps.""" + result, mapping = split_chains_at_gaps(multi_gap_pdb) + + # Count unique chains in result + lines = result.split("\n") + chains_found = set() + for line in lines: + if line.startswith("ATOM") and len(line) > 21: + chains_found.add(line[21]) + + # Should have 3 segments (original + 2 new) + assert len(chains_found) == 3 + + def test_preserves_atom_coordinates(self, gapped_peptide_pdb): + """Test that splitting preserves atom coordinates.""" + result, _ = split_chains_at_gaps(gapped_peptide_pdb) + + # Count atoms in original and result + orig_atoms = sum( + 1 + for line in gapped_peptide_pdb.split("\n") + if line.startswith("ATOM") + ) + 
result_atoms = sum( + 1 for line in result.split("\n") if line.startswith("ATOM") + ) + + assert orig_atoms == result_atoms + + def test_mapping_tracks_original_chain(self, gapped_peptide_pdb): + """Test that mapping correctly tracks original chain IDs.""" + _, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # All mapped chains should map back to A + for orig_chain in mapping.values(): + assert orig_chain == "A" + + +class TestRestoreChainIds: + """Tests for restore_chain_ids function.""" + + def test_no_change_without_mapping(self, gapped_peptide_pdb): + """Test that PDB is unchanged without mapping.""" + result = restore_chain_ids(gapped_peptide_pdb, {}) + assert result == gapped_peptide_pdb + + def test_restores_original_chain_ids(self, gapped_peptide_pdb): + """Test that original chain IDs are restored after splitting.""" + # Split chains + split_pdb, mapping = split_chains_at_gaps(gapped_peptide_pdb) + + # Restore original chain IDs + restored = restore_chain_ids(split_pdb, mapping) + + # All atoms should be in chain A again + for line in restored.split("\n"): + if line.startswith("ATOM") and len(line) > 21: + assert line[21] == "A" + + def test_roundtrip_preserves_atoms(self, gapped_peptide_pdb): + """Test split -> restore roundtrip preserves atom count.""" + orig_atoms = sum( + 1 + for line in gapped_peptide_pdb.split("\n") + if line.startswith("ATOM") + ) + + split_pdb, mapping = split_chains_at_gaps(gapped_peptide_pdb) + restored = restore_chain_ids(split_pdb, mapping) + + restored_atoms = sum( + 1 for line in restored.split("\n") if line.startswith("ATOM") + ) + + assert orig_atoms == restored_atoms + + +class TestAddTerRecordsAtGaps: + """Tests for add_ter_records_at_gaps function.""" + + def test_no_ter_without_gaps(self, continuous_peptide_pdb): + """Test no TER records added to continuous peptide.""" + gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) + result = add_ter_records_at_gaps(continuous_peptide_pdb, gaps) + assert 
result == continuous_peptide_pdb + + def test_adds_ter_at_gap(self, gapped_peptide_pdb): + """Test TER record is added at gap location.""" + gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) + result = add_ter_records_at_gaps(gapped_peptide_pdb, gaps) + + # Count TER records + ter_count = sum( + 1 for line in result.split("\n") if line.startswith("TER") + ) + assert ter_count >= 1 + + +class TestGetGapSummary: + """Tests for get_gap_summary function.""" + + def test_no_gaps_message(self): + """Test message when no gaps detected.""" + summary = get_gap_summary([]) + assert "No chain gaps detected" in summary + + def test_summary_with_gaps(self): + """Test summary includes gap information.""" + gaps = [ + ChainGap(chain_id="A", residue_before=10, residue_after=15), + ChainGap(chain_id="B", residue_before=5, residue_after=20), + ] + summary = get_gap_summary(gaps) + + assert "2 chain gap(s)" in summary + assert "Chain A" in summary + assert "Chain B" in summary From e1393e7b74dadd0336059d25dcccf8674e23a7d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 12 Jan 2026 15:10:10 +0000 Subject: [PATCH 04/22] Fix chain splitting to assign new chain ID once per segment, not per atom The bug was assigning a new chain ID for every atom at a gap start residue instead of just once when entering a new segment. Added tracking of processed gap starts to prevent duplicate chain assignments. 
--- src/graphrelax/chain_gaps.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py index 49b15ae..e7404b7 100644 --- a/src/graphrelax/chain_gaps.py +++ b/src/graphrelax/chain_gaps.py @@ -171,9 +171,11 @@ def split_chains_at_gaps( current_chain_map = {} # original_chain_id -> current_new_chain_id next_id_idx = 0 + # Track which gap starts we've already processed + processed_gap_starts = set() + # Process PDB line by line output_lines = [] - current_residue = {} # chain_id -> (resnum, icode) for line in pdb_string.split("\n"): if line.startswith(("ATOM", "HETATM")) and len(line) > 26: @@ -181,8 +183,12 @@ def split_chains_at_gaps( resnum = int(line[22:26].strip()) icode = line[26].strip() if len(line) > 26 else "" + gap_key = (chain_id, resnum, icode) + # Check if this residue starts a new segment (at a gap) - if (chain_id, resnum, icode) in gap_starts: + # Only process each gap start once + if gap_key in gap_starts and gap_key not in processed_gap_starts: + processed_gap_starts.add(gap_key) # Assign a new chain ID for this segment if next_id_idx < len(available_ids): new_chain_id = available_ids[next_id_idx] @@ -209,8 +215,6 @@ def split_chains_at_gaps( new_chain_id = current_chain_map[chain_id] line = line[:21] + new_chain_id + line[22:] - current_residue[chain_id] = (resnum, icode) - elif line.startswith("TER") and len(line) > 21: # Update TER record chain ID too chain_id = line[21] From 7bb847b45815f1901801df8e13c3c0c0004163ff Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:00:25 -0500 Subject: [PATCH 05/22] Fix CI disk space issue in integration tests - Free up disk space by removing unused .NET, GHC, and Boost packages - Install CPU-only PyTorch to avoid large CUDA dependencies - Use --no-cache-dir to minimize pip cache usage The GitHub Actions runner was running out of disk space when installing PyTorch with CUDA dependencies (~5-7GB) alongside conda 
packages. Co-Authored-By: Claude Opus 4.5 --- .github/workflows/test.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7a2a6f..0b0c004 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,13 +41,21 @@ jobs: activate-environment: test auto-activate-base: false + - name: Free up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + df -h + - name: Install conda dependencies run: | conda install -y openmm pdbfixer - name: Install pip dependencies run: | - pip install -e .[test] + pip install torch --index-url https://download.pytorch.org/whl/cpu + pip install --no-cache-dir -e .[test] - name: Download LigandMPNN weights run: | From 34bb8727868be8f078fe24e86677f2f4da073b07 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:30:41 -0500 Subject: [PATCH 06/22] Add ligand-aware unconstrained minimization support Enable unconstrained minimization for protein-ligand complexes using openmmforcefields for small molecule parameterization. Changes: - Add include_ligands, ligand_forcefield, and ligand_smiles config options - Create ligand_utils.py module for ligand extraction and parameterization - Add _relax_unconstrained_with_ligands method using SystemGenerator - Update CLI with --include-ligands, --ligand-forcefield, --ligand-smiles - Add [ligands] optional dependency group to pyproject.toml - Add unit tests for ligand utilities - Add ligand_utils.py to pylint exclude (optional dependency imports) The implementation: 1. Separates protein (ATOM) and ligands (HETATM) from PDB 2. Processes protein with pdbfixer (avoiding terminal detection issues) 3. Parameterizes ligands with OpenFF Sage 2.0 (default) or GAFF2/Espaloma 4. 
Combines topologies and minimizes together Usage: graphrelax -i complex.pdb -o minimized.pdb --include-ligands graphrelax -i complex.pdb -o minimized.pdb --include-ligands \ --ligand-forcefield gaff-2.11 --ligand-smiles 'LIG:c1ccccc1' Requires: pip install graphrelax[ligands] Co-Authored-By: Claude Opus 4.5 --- .pre-commit-config.yaml | 2 +- pyproject.toml | 10 ++ src/graphrelax/cli.py | 74 +++++++++-- src/graphrelax/config.py | 15 ++- src/graphrelax/ligand_utils.py | 236 +++++++++++++++++++++++++++++++++ src/graphrelax/relaxer.py | 176 ++++++++++++++++++++++++ tests/test_ligand_utils.py | 163 +++++++++++++++++++++++ 7 files changed, 666 insertions(+), 10 deletions(-) create mode 100644 src/graphrelax/ligand_utils.py create mode 100644 tests/test_ligand_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b476cb..362fbc1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - "--enable=C0415" - "--score=n" name: pylint (no lazy imports) - exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/LigandMPNN/|tests/)" + exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/LigandMPNN/|tests/)" - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/pyproject.toml b/pyproject.toml index 82b2ac4..266e8a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,10 @@ dependencies = [ # Note: pdbfixer is not on PyPI, install via conda: # conda install -c conda-forge pdbfixer +# For ligand support in unconstrained minimization: +# pip install graphrelax[ligands] +# or: +# conda install -c conda-forge openmmforcefields openff-toolkit rdkit [project.urls] Homepage = "https://github.com/delalamo/GraphRelax" @@ -63,6 +67,12 @@ test = [ "pytest>=7.0", 
"pytest-cov>=4.1", ] +ligands = [ + # For ligand parameterization in unconstrained minimization + "openmmforcefields>=0.13.0", + "openff-toolkit>=0.14.0", + "rdkit>=2023.09.1", +] [project.scripts] graphrelax = "graphrelax.cli:main" diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 870d1c4..0a66860 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -216,6 +216,36 @@ def create_parser() -> argparse.ArgumentParser: "to prevent artificial gap closure during minimization." ), ) + relax_group.add_argument( + "--include-ligands", + action="store_true", + help=( + "Include ligands in unconstrained minimization using " + "openmmforcefields. Requires: pip install openmmforcefields " + "openff-toolkit. If not specified with ligands present, " + "--constrained-minimization is required." + ), + ) + relax_group.add_argument( + "--ligand-forcefield", + choices=["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"], + default="openff-2.0.0", + metavar="FF", + help=( + "Force field for ligand parameterization (default: openff-2.0.0)." + " Options: openff-2.0.0 (Sage), gaff-2.11 (GAFF2), espaloma-0.3.0" + ), + ) + relax_group.add_argument( + "--ligand-smiles", + type=str, + action="append", + metavar="RESNAME:SMILES", + help=( + "Provide SMILES for a ligand residue. Format: RESNAME:SMILES. " + "Can be used multiple times. 
Example: --ligand-smiles LIG:SMILES" + ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -307,19 +337,31 @@ def main(args=None) -> int: input_format = detect_format(opts.input) has_ligands = _check_for_ligands(opts.input, input_format) - # Validate: ligand_mpnn with ligands requires constrained minimization + # Validate: ligand handling requires --include-ligands or --constrained uses_relaxation = mode in ( PipelineMode.RELAX, PipelineMode.NO_REPACK, PipelineMode.DESIGN, ) - if has_ligands and uses_relaxation and not opts.constrained_minimization: - logger.error( - "Input PDB contains ligands (HETATM records). " - "Unconstrained minimization cannot handle non-standard residues. " - "Please use --constrained-minimization flag." - ) - return 1 + if has_ligands and uses_relaxation: + if opts.include_ligands: + # Verify openmmforcefields is available + try: + import openmmforcefields # noqa: F401 + except ImportError: + logger.error( + "The --include-ligands flag requires openmmforcefields. " + "Install with: pip install openmmforcefields openff-toolkit" + ) + return 1 + logger.info("Ligand support enabled via openmmforcefields") + elif not opts.constrained_minimization: + logger.error( + "Input PDB contains ligands (HETATM records). Use one of:\n" + " 1. --include-ligands (requires openmmforcefields)\n" + " 2. --constrained-minimization" + ) + return 1 logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") @@ -333,11 +375,27 @@ def main(args=None) -> int: seed=opts.seed, ) + # Parse ligand SMILES if provided + ligand_smiles = {} + if opts.ligand_smiles: + for entry in opts.ligand_smiles: + if ":" not in entry: + logger.error( + f"Invalid --ligand-smiles format: '{entry}'. 
" + "Expected format: RESNAME:SMILES" + ) + return 1 + resname, smiles = entry.split(":", 1) + ligand_smiles[resname.strip().upper()] = smiles.strip() + relax_config = RelaxConfig( stiffness=opts.stiffness, max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, + include_ligands=opts.include_ligands, + ligand_forcefield=opts.ligand_forcefield, + ligand_smiles=ligand_smiles, ) pipeline_config = PipelineConfig( diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 7f55c99..c00f37a 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -3,7 +3,9 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Literal, Optional +from typing import Dict, Literal, Optional + +LigandForceField = Literal["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"] class PipelineMode(Enum): @@ -44,6 +46,17 @@ class RelaxConfig: split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available + # Ligand support options + include_ligands: bool = ( + False # Enable ligand parameterization in unconstrained + ) + ligand_forcefield: LigandForceField = ( + "openff-2.0.0" # Force field for ligands + ) + ligand_smiles: Dict[str, str] = field( + default_factory=dict + ) # {resname: SMILES} + @dataclass class PipelineConfig: diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py new file mode 100644 index 0000000..d5e5e9b --- /dev/null +++ b/src/graphrelax/ligand_utils.py @@ -0,0 +1,236 @@ +"""Utilities for ligand parameterization and handling.""" + +import io +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +# Water residue names to exclude from ligand processing +WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} + + +@dataclass +class LigandInfo: + """Information about 
a ligand extracted from PDB.""" + + resname: str + chain_id: str + resnum: int + pdb_lines: List[str] + smiles: Optional[str] = None + + +def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: + """ + Separate protein ATOM records from ligand HETATM records. + + Args: + pdb_string: Full PDB string with protein and ligands + + Returns: + Tuple of (protein_only_pdb, list_of_ligand_info) + """ + protein_lines = [] + ligand_lines_by_residue = {} + + for line in pdb_string.split("\n"): + if line.startswith("HETATM"): + resname = line[17:20].strip() + chain_id = line[21] if len(line) > 21 else " " + try: + resnum = int(line[22:26].strip()) + except ValueError: + resnum = 0 + + # Skip water + if resname in WATER_RESIDUES: + continue + + key = (chain_id, resnum, resname) + if key not in ligand_lines_by_residue: + ligand_lines_by_residue[key] = [] + ligand_lines_by_residue[key].append(line) + elif line.startswith("END"): + pass # Skip, will add back + else: + protein_lines.append(line) + + protein_pdb = "\n".join(protein_lines) + "\nEND\n" + + ligands = [] + for (chain_id, resnum, resname), lines in ligand_lines_by_residue.items(): + ligands.append( + LigandInfo( + resname=resname, + chain_id=chain_id, + resnum=resnum, + pdb_lines=lines, + ) + ) + + return protein_pdb, ligands + + +def get_common_ligand_smiles() -> Dict[str, str]: + """ + Return SMILES for commonly encountered ligands. + + This helps with ligands where automatic bond perception may fail. 
+ """ + return { + # Heme and porphyrins + "HEM": ( + "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" + "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" + ), + "HEC": ( + "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" + "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" + ), + # Common cofactors + "NAD": ( + "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OC[C@H]2O[C@H]([C@H](O)[C@@H]2O)n2cnc3c(N)ncnc23)[C@@H](O)[C@H]1O" + ), + "NAP": ( # NADP + "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OC[C@H]2O[C@H]([C@H](OP(=O)([O-])[O-])[C@@H]2O)n2cnc3c(N)ncnc23)" + "[C@@H](O)[C@H]1O" + ), + "FAD": ( + "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" + "COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)" + "n3cnc4c(N)ncnc43)c2cc1C" + ), + "FMN": ( + "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" + "COP(=O)([O-])[O-])c2cc1C" + ), + "ATP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + "ADP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])" + "[C@@H](O)[C@H]1O" + ), + "AMP": ( + "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + "GTP": ( + "Nc1nc2n(cnc2c(=O)[nH]1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" + "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" + ), + # Common ions (simple) + "ZN": "[Zn+2]", + "MG": "[Mg+2]", + "CA": "[Ca+2]", + "FE": "[Fe+2]", + "FE2": "[Fe+2]", + "MN": "[Mn+2]", + "CU": "[Cu+2]", + "CO": "[Co+2]", + # Common small molecules + "ACE": "CC(=O)", # Acetyl + "NME": "NC", # N-methyl + "ACT": "CC(=O)[O-]", # Acetate + "GOL": "OCC(O)CO", # Glycerol + "EDO": "OCCO", # Ethylene glycol + "PEG": "COCCOCCOCCO", # PEG fragment + "SO4": "[O-]S(=O)(=O)[O-]", # Sulfate + "PO4": "[O-]P(=O)([O-])[O-]", # Phosphate + "CL": "[Cl-]", # Chloride + } + + +def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): + """ + Create an OpenFF Toolkit 
Molecule from ligand info. + + Args: + ligand: LigandInfo with PDB coordinates + smiles: Optional SMILES string (if known) + + Returns: + openff.toolkit.Molecule + """ + from openff.toolkit import Molecule + + if smiles: + # Create from SMILES + try: + mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) + logger.debug(f"Created molecule for {ligand.resname} from SMILES") + return mol + except Exception as e: + logger.warning(f"Failed to create from SMILES: {e}") + + # Try to create from PDB block + pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + + try: + # Try direct PDB parsing + mol = Molecule.from_pdb_file( + io.StringIO(pdb_block), + allow_undefined_stereo=True, + ) + logger.debug(f"Created molecule for {ligand.resname} from PDB") + return mol + except Exception as e: + logger.debug(f"Direct PDB parsing failed: {e}") + + # Fallback: use RDKit + try: + mol = _create_molecule_via_rdkit(pdb_block) + logger.debug(f"Created molecule for {ligand.resname} via RDKit") + return mol + except Exception as e: + raise ValueError( + f"Could not create molecule for ligand {ligand.resname}: {e}" + ) + + +def _create_molecule_via_rdkit(pdb_block: str): + """Create OpenFF Molecule via RDKit from PDB block.""" + from openff.toolkit import Molecule + from rdkit import Chem + from rdkit.Chem import AllChem + + # Parse PDB with RDKit + mol = Chem.MolFromPDBBlock(pdb_block, removeHs=False, sanitize=False) + + if mol is None: + raise ValueError("RDKit could not parse PDB block") + + # Try to sanitize + try: + Chem.SanitizeMol(mol) + except Exception: + # Try without hydrogens and re-add them + mol = Chem.MolFromPDBBlock(pdb_block, removeHs=True, sanitize=True) + if mol is None: + raise ValueError("RDKit sanitization failed") + mol = Chem.AddHs(mol, addCoords=True) + AllChem.EmbedMolecule(mol, randomSeed=42) + + # Convert to OpenFF Molecule + return Molecule.from_rdkit(mol, allow_undefined_stereo=True) + + +def ligand_pdb_to_topology(ligand: LigandInfo): + """ + 
Convert ligand PDB lines to OpenMM topology and positions. + + Args: + ligand: LigandInfo with PDB lines + + Returns: + Tuple of (topology, positions) + """ + from openmm import app as openmm_app + + pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block)) + return pdb_file.topology, pdb_file.positions diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index de351a5..e4b1a5f 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -18,6 +18,21 @@ split_chains_at_gaps, ) from graphrelax.config import RelaxConfig +from graphrelax.ligand_utils import ( + WATER_RESIDUES, + create_openff_molecule, + extract_ligands_from_pdb, + get_common_ligand_smiles, + ligand_pdb_to_topology, +) + +# Check if openmmforcefields is available for ligand support +try: + from openmmforcefields.generators import SystemGenerator + + OPENMMFF_AVAILABLE = True +except ImportError: + OPENMMFF_AVAILABLE = False # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -156,12 +171,30 @@ def _relax_unconstrained( No position restraints, no violation checking, uses OpenMM defaults. This is the default minimization mode. + If config.include_ligands is True and ligands are present, uses + openmmforcefields for ligand parameterization. + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ + # Check if ligands are present (non-water HETATM records) + has_ligands = any( + line.startswith("HETATM") + and line[17:20].strip() not in WATER_RESIDUES + for line in pdb_string.split("\n") + ) + + if has_ligands and self.config.include_ligands: + if not OPENMMFF_AVAILABLE: + raise ImportError( + "openmmforcefields is required for ligand support. 
" + "Install with: pip install openmmforcefields openff-toolkit" + ) + return self._relax_unconstrained_with_ligands(pdb_string) + ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms @@ -261,6 +294,149 @@ def _relax_unconstrained( return relaxed_pdb, debug_data, violations + def _relax_unconstrained_with_ligands( + self, pdb_string: str + ) -> Tuple[str, dict, np.ndarray]: + """ + Unconstrained OpenMM minimization with ligand support. + + Uses openmmforcefields to parameterize ligands with GAFF2/OpenFF. + The protein is processed separately with pdbfixer, then combined + with parameterized ligands for minimization. + + Args: + pdb_string: PDB file contents as string + + Returns: + Tuple of (relaxed_pdb_string, debug_info, violations) + """ + from pdbfixer import PDBFixer + + ENERGY = unit.kilocalories_per_mole + LENGTH = unit.angstroms + + use_gpu = self._check_gpu_available() + + logger.info( + f"Running unconstrained minimization with ligands " + f"(forcefield={self.config.ligand_forcefield}, gpu={use_gpu})" + ) + + # Step 1: Separate protein and ligands + protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + logger.info( + f"Found {len(ligands)} ligand(s): {[l.resname for l in ligands]}" + ) + + # Step 2: Fix protein with pdbfixer (without ligands) + fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) + fixer.findMissingResidues() + fixer.findMissingAtoms() + fixer.addMissingAtoms() + + # Step 3: Create OpenFF molecules for ligands + known_smiles = get_common_ligand_smiles() + known_smiles.update(self.config.ligand_smiles or {}) + + openff_molecules = [] + for ligand in ligands: + smiles = known_smiles.get(ligand.resname) + try: + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) + logger.debug(f"Created OpenFF molecule for {ligand.resname}") + except Exception as e: + resname = ligand.resname + raise ValueError( + f"Could not parameterize ligand '{resname}' " + f"(chain {ligand.chain_id}, res {ligand.resnum}): 
{e}\n\n" + f"Options to resolve:\n" + f" 1. Provide SMILES via config: " + f"ligand_smiles={{'{resname}': 'SMILES_STRING'}}\n" + f" 2. Use constrained minimization: constrained=True\n" + f" 3. Remove the ligand from the input PDB\n\n" + f"For common ligands, see: " + f"https://www.ebi.ac.uk/pdbe-srv/pdbechem/" + ) + + # Step 4: Create combined topology using Modeller + modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + + # Add ligands back to modeller + for ligand in ligands: + ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) + modeller.add(ligand_topology, ligand_positions) + + # Step 5: Create SystemGenerator with ligand support + system_generator = SystemGenerator( + forcefields=["amber/ff14SB.xml"], + small_molecule_forcefield=self.config.ligand_forcefield, + molecules=openff_molecules, + forcefield_kwargs={ + "constraints": openmm_app.HBonds, + "removeCMMotion": True, + }, + ) + + # Add hydrogens + modeller.addHydrogens(system_generator.forcefield) + + # Create system + system = system_generator.create_system(modeller.topology) + + # Step 6: Run minimization + integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) + platform = openmm.Platform.getPlatformByName( + "CUDA" if use_gpu else "CPU" + ) + simulation = openmm_app.Simulation( + modeller.topology, system, integrator, platform + ) + simulation.context.setPositions(modeller.positions) + + # Get initial energy + state = simulation.context.getState(getEnergy=True, getPositions=True) + einit = state.getPotentialEnergy().value_in_unit(ENERGY) + posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) + + # Minimize + if self.config.max_iterations > 0: + simulation.minimizeEnergy(maxIterations=self.config.max_iterations) + else: + simulation.minimizeEnergy() + + # Get final state + state = simulation.context.getState(getEnergy=True, getPositions=True) + efinal = state.getPotentialEnergy().value_in_unit(ENERGY) + pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) 
+ + # Calculate RMSD + rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) + + # Write output PDB + output = io.StringIO() + openmm_app.PDBFile.writeFile( + simulation.topology, state.getPositions(), output + ) + relaxed_pdb = output.getvalue() + + debug_data = { + "initial_energy": einit, + "final_energy": efinal, + "rmsd": rmsd, + "attempts": 1, + "ligands_included": [lig.resname for lig in ligands], + "ligand_forcefield": self.config.ligand_forcefield, + } + + logger.info( + f"Minimization complete: E_init={einit:.2f}, " + f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" + ) + + violations = np.zeros(0) + return relaxed_pdb, debug_data, violations + def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ Direct OpenMM minimization without pdbfixer. diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py new file mode 100644 index 0000000..f83b1b0 --- /dev/null +++ b/tests/test_ligand_utils.py @@ -0,0 +1,163 @@ +"""Unit tests for ligand utilities.""" + +from graphrelax.ligand_utils import ( + WATER_RESIDUES, + LigandInfo, + extract_ligands_from_pdb, + get_common_ligand_smiles, +) + + +class TestExtractLigands: + """Tests for ligand extraction from PDB.""" + + def test_extract_no_ligands(self, small_peptide_pdb_string): + """Test extraction from protein-only PDB.""" + protein_pdb, ligands = extract_ligands_from_pdb( + small_peptide_pdb_string + ) + + assert len(ligands) == 0 + assert "ATOM" in protein_pdb + assert "END" in protein_pdb + + def test_extract_with_ligand(self): + """Test extraction of a ligand from PDB.""" + # fmt: off + pdb_with_ligand = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "ATOM 3 C ALA A 1 2.0 1.4 0.0 1.00 0.00\n" + "ATOM 4 O ALA A 1 1.2 2.4 0.0 1.00 0.00\n" + "HETATM 5 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 NA HEM A 200 3.5 5.0 5.0 1.00 0.00\n" + "HETATM 7 NB HEM A 200 5.0 3.5 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = 
extract_ligands_from_pdb(pdb_with_ligand) + + assert len(ligands) == 1 + assert ligands[0].resname == "HEM" + assert ligands[0].chain_id == "A" + assert ligands[0].resnum == 200 + assert len(ligands[0].pdb_lines) == 3 # FE, NA, NB + + # Check protein PDB doesn't have HETATM + assert "HETATM" not in protein_pdb + assert "ATOM" in protein_pdb + + def test_water_excluded(self): + """Test that water is not extracted as ligand.""" + # fmt: off + pdb_with_water = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "HETATM 5 O HOH A 301 10.0 10.0 10.0 1.00 0.00\n" + "HETATM 6 O WAT A 302 11.0 10.0 10.0 1.00 0.00\n" + "HETATM 7 O SOL A 303 12.0 10.0 10.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = extract_ligands_from_pdb(pdb_with_water) + + # Waters should not be in ligand list + assert len(ligands) == 0 + assert not any(lig.resname in WATER_RESIDUES for lig in ligands) + + def test_multiple_ligands(self): + """Test extraction of multiple different ligands.""" + # fmt: off + pdb_with_multiple = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 5 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 N1 NAD B 301 8.0 8.0 8.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + protein_pdb, ligands = extract_ligands_from_pdb(pdb_with_multiple) + + assert len(ligands) == 2 + resnames = {lig.resname for lig in ligands} + assert resnames == {"HEM", "NAD"} + + +class TestCommonSmiles: + """Tests for common ligand SMILES lookup.""" + + def test_heme_smiles_present(self): + """Test that HEM SMILES is defined.""" + smiles = get_common_ligand_smiles() + assert "HEM" in smiles + assert len(smiles["HEM"]) > 0 + # Should contain iron + assert "Fe" in smiles["HEM"] + + def test_atp_smiles_present(self): + """Test that ATP SMILES is defined.""" + smiles = get_common_ligand_smiles() + assert "ATP" in smiles + # ATP has phosphate groups + assert "P" in smiles["ATP"] + + def test_common_cofactors(self): + """Test that common cofactors 
are defined.""" + smiles = get_common_ligand_smiles() + expected = ["NAD", "FAD", "ATP", "ADP", "GTP"] + for cofactor in expected: + assert cofactor in smiles, f"{cofactor} should be in common SMILES" + + def test_common_ions(self): + """Test that common metal ions are defined.""" + smiles = get_common_ligand_smiles() + ions = ["ZN", "MG", "CA", "FE", "MN", "CU"] + for ion in ions: + assert ion in smiles, f"{ion} should be in common SMILES" + + +class TestLigandInfo: + """Tests for LigandInfo dataclass.""" + + def test_ligand_info_creation(self): + """Test creating a LigandInfo object.""" + ligand = LigandInfo( + resname="HEM", + chain_id="A", + resnum=200, + pdb_lines=[ + "HETATM 5 FE HEM A 200 5.000 5.000 5.000" + ], + ) + assert ligand.resname == "HEM" + assert ligand.chain_id == "A" + assert ligand.resnum == 200 + assert ligand.smiles is None + + def test_ligand_info_with_smiles(self): + """Test LigandInfo with optional SMILES.""" + ligand = LigandInfo( + resname="BEN", + chain_id="B", + resnum=1, + pdb_lines=[ + "HETATM 1 C1 BEN B 1 0.000 0.000 0.000" + ], + smiles="c1ccccc1", + ) + assert ligand.smiles == "c1ccccc1" + + +class TestWaterResidues: + """Tests for water residue constants.""" + + def test_common_water_names(self): + """Test that common water names are included.""" + assert "HOH" in WATER_RESIDUES + assert "WAT" in WATER_RESIDUES + assert "SOL" in WATER_RESIDUES + + def test_tip_models(self): + """Test that TIP water models are included.""" + assert "TIP3" in WATER_RESIDUES + assert "TIP4" in WATER_RESIDUES + assert "SPC" in WATER_RESIDUES From a1da6da02bafa1fc756447eb6cf8a37b20798d02 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:35:35 -0500 Subject: [PATCH 07/22] Refactor ligand SMILES: use RDKit parsing instead of hardcoded dictionary Replace the large hardcoded SMILES dictionary with dynamic extraction from PDB coordinates using RDKit bond perception: - Rename get_common_ligand_smiles() to get_ion_smiles() (only ions need 
explicit SMILES since they're single atoms without bond info) - Add is_single_atom_ligand() helper function - Update create_openff_molecule() to try RDKit parsing first, then fall back to OpenFF PDB parsing, using ion lookup only for single atoms - Update relaxer.py to use the new approach - Update tests to reflect the refactored functions This removes ~70 lines of hardcoded SMILES while making the code more robust - it can now handle any ligand that RDKit can perceive bonds for. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/ligand_utils.py | 114 +++++++++++++-------------------- src/graphrelax/relaxer.py | 12 ++-- tests/test_ligand_utils.py | 74 +++++++++++++-------- 3 files changed, 96 insertions(+), 104 deletions(-) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index d5e5e9b..0c5d702 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -73,57 +73,14 @@ def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: return protein_pdb, ligands -def get_common_ligand_smiles() -> Dict[str, str]: +def get_ion_smiles() -> Dict[str, str]: """ - Return SMILES for commonly encountered ligands. + Return SMILES for common ions. - This helps with ligands where automatic bond perception may fail. + Ions are single atoms that cannot be parsed from PDB coordinates, + so we need explicit SMILES for them. 
""" return { - # Heme and porphyrins - "HEM": ( - "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" - "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" - ), - "HEC": ( - "[Fe+2]12([N-]3C(=C4C(=C([N-]1C(=C5C(C(=C([N-]2C(=C3C)C(=C)" - "C)C=C5C)CCC(=O)O)C)C=C6[N-]4C(=C(C)C6C=C)C(C)=C)CCC(=O)O)C)C=C)C" - ), - # Common cofactors - "NAD": ( - "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OC[C@H]2O[C@H]([C@H](O)[C@@H]2O)n2cnc3c(N)ncnc23)[C@@H](O)[C@H]1O" - ), - "NAP": ( # NADP - "NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OC[C@H]2O[C@H]([C@H](OP(=O)([O-])[O-])[C@@H]2O)n2cnc3c(N)ncnc23)" - "[C@@H](O)[C@H]1O" - ), - "FAD": ( - "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" - "COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)" - "n3cnc4c(N)ncnc43)c2cc1C" - ), - "FMN": ( - "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)" - "COP(=O)([O-])[O-])c2cc1C" - ), - "ATP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - "ADP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])" - "[C@@H](O)[C@H]1O" - ), - "AMP": ( - "Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - "GTP": ( - "Nc1nc2n(cnc2c(=O)[nH]1)[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])" - "OP(=O)([O-])[O-])[C@@H](O)[C@H]1O" - ), - # Common ions (simple) "ZN": "[Zn+2]", "MG": "[Mg+2]", "CA": "[Ca+2]", @@ -132,60 +89,77 @@ def get_common_ligand_smiles() -> Dict[str, str]: "MN": "[Mn+2]", "CU": "[Cu+2]", "CO": "[Co+2]", - # Common small molecules - "ACE": "CC(=O)", # Acetyl - "NME": "NC", # N-methyl - "ACT": "CC(=O)[O-]", # Acetate - "GOL": "OCC(O)CO", # Glycerol - "EDO": "OCCO", # Ethylene glycol - "PEG": "COCCOCCOCCO", # PEG fragment - "SO4": "[O-]S(=O)(=O)[O-]", # Sulfate - "PO4": "[O-]P(=O)([O-])[O-]", # Phosphate - "CL": "[Cl-]", # Chloride + "NA": "[Na+]", + "K": "[K+]", + "CL": "[Cl-]", } +def is_single_atom_ligand(ligand: LigandInfo) -> 
bool: + """Check if ligand is a single atom (ion).""" + return len(ligand.pdb_lines) == 1 + + def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): """ Create an OpenFF Toolkit Molecule from ligand info. + Attempts to create the molecule in this order: + 1. From user-provided SMILES (if given) + 2. From ion lookup table (for single-atom ligands) + 3. From PDB coordinates via RDKit bond perception + 4. From direct OpenFF PDB parsing + Args: ligand: LigandInfo with PDB coordinates - smiles: Optional SMILES string (if known) + smiles: Optional SMILES string (overrides automatic detection) Returns: openff.toolkit.Molecule """ from openff.toolkit import Molecule + # 1. User-provided SMILES takes precedence if smiles: - # Create from SMILES try: mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) logger.debug(f"Created molecule for {ligand.resname} from SMILES") return mol except Exception as e: - logger.warning(f"Failed to create from SMILES: {e}") + logger.warning(f"Failed to create from provided SMILES: {e}") + + # 2. Handle ions (single atoms) - need explicit SMILES + if is_single_atom_ligand(ligand): + ion_smiles = get_ion_smiles() + if ligand.resname in ion_smiles: + mol = Molecule.from_smiles( + ion_smiles[ligand.resname], allow_undefined_stereo=True + ) + logger.debug(f"Created ion molecule for {ligand.resname}") + return mol + else: + raise ValueError( + f"Unknown ion '{ligand.resname}'. Provide SMILES via " + f"ligand_smiles={{'{ligand.resname}': '[Element+charge]'}}" + ) - # Try to create from PDB block + # 3. Try RDKit bond perception from PDB coordinates pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + try: + mol = _create_molecule_via_rdkit(pdb_block) + logger.debug(f"Created molecule for {ligand.resname} via RDKit") + return mol + except Exception as e: + logger.debug(f"RDKit parsing failed: {e}") + # 4. 
Fallback: direct OpenFF PDB parsing try: - # Try direct PDB parsing mol = Molecule.from_pdb_file( io.StringIO(pdb_block), allow_undefined_stereo=True, ) logger.debug(f"Created molecule for {ligand.resname} from PDB") return mol - except Exception as e: - logger.debug(f"Direct PDB parsing failed: {e}") - - # Fallback: use RDKit - try: - mol = _create_molecule_via_rdkit(pdb_block) - logger.debug(f"Created molecule for {ligand.resname} via RDKit") - return mol except Exception as e: raise ValueError( f"Could not create molecule for ligand {ligand.resname}: {e}" diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index e4b1a5f..3373beb 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -22,7 +22,6 @@ WATER_RESIDUES, create_openff_molecule, extract_ligands_from_pdb, - get_common_ligand_smiles, ligand_pdb_to_topology, ) @@ -324,9 +323,8 @@ def _relax_unconstrained_with_ligands( # Step 1: Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - logger.info( - f"Found {len(ligands)} ligand(s): {[l.resname for l in ligands]}" - ) + ligand_names = [lig.resname for lig in ligands] + logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") # Step 2: Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) @@ -335,12 +333,12 @@ def _relax_unconstrained_with_ligands( fixer.addMissingAtoms() # Step 3: Create OpenFF molecules for ligands - known_smiles = get_common_ligand_smiles() - known_smiles.update(self.config.ligand_smiles or {}) + # User-provided SMILES override automatic detection + user_smiles = self.config.ligand_smiles or {} openff_molecules = [] for ligand in ligands: - smiles = known_smiles.get(ligand.resname) + smiles = user_smiles.get(ligand.resname) try: mol = create_openff_molecule(ligand, smiles=smiles) openff_molecules.append(mol) diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py index f83b1b0..5187941 100644 --- 
a/tests/test_ligand_utils.py +++ b/tests/test_ligand_utils.py @@ -4,7 +4,8 @@ WATER_RESIDUES, LigandInfo, extract_ligands_from_pdb, - get_common_ligand_smiles, + get_ion_smiles, + is_single_atom_ligand, ) @@ -82,37 +83,56 @@ def test_multiple_ligands(self): assert resnames == {"HEM", "NAD"} -class TestCommonSmiles: - """Tests for common ligand SMILES lookup.""" - - def test_heme_smiles_present(self): - """Test that HEM SMILES is defined.""" - smiles = get_common_ligand_smiles() - assert "HEM" in smiles - assert len(smiles["HEM"]) > 0 - # Should contain iron - assert "Fe" in smiles["HEM"] - - def test_atp_smiles_present(self): - """Test that ATP SMILES is defined.""" - smiles = get_common_ligand_smiles() - assert "ATP" in smiles - # ATP has phosphate groups - assert "P" in smiles["ATP"] - - def test_common_cofactors(self): - """Test that common cofactors are defined.""" - smiles = get_common_ligand_smiles() - expected = ["NAD", "FAD", "ATP", "ADP", "GTP"] - for cofactor in expected: - assert cofactor in smiles, f"{cofactor} should be in common SMILES" +class TestIonSmiles: + """Tests for ion SMILES lookup.""" def test_common_ions(self): """Test that common metal ions are defined.""" - smiles = get_common_ligand_smiles() + smiles = get_ion_smiles() ions = ["ZN", "MG", "CA", "FE", "MN", "CU"] for ion in ions: - assert ion in smiles, f"{ion} should be in common SMILES" + assert ion in smiles, f"{ion} should be in ion SMILES" + + def test_ion_smiles_format(self): + """Test that ion SMILES have proper format with charge.""" + smiles = get_ion_smiles() + # All ions should have brackets indicating charged species + for ion, smi in smiles.items(): + assert smi.startswith("["), f"{ion} SMILES should start with [" + assert smi.endswith("]"), f"{ion} SMILES should end with ]" + + def test_halide_ions(self): + """Test that halide ions are defined.""" + smiles = get_ion_smiles() + assert "CL" in smiles + assert smiles["CL"] == "[Cl-]" + + +class TestSingleAtomLigand: + """Tests 
for single atom ligand detection.""" + + def test_ion_is_single_atom(self): + """Test that ions are detected as single atoms.""" + ligand = LigandInfo( + resname="ZN", + chain_id="A", + resnum=1, + pdb_lines=["HETATM 1 ZN ZN A 1 0.0 0.0 0.0"], + ) + assert is_single_atom_ligand(ligand) + + def test_heme_is_not_single_atom(self): + """Test that multi-atom ligands are not single atoms.""" + ligand = LigandInfo( + resname="HEM", + chain_id="A", + resnum=1, + pdb_lines=[ + "HETATM 1 FE HEM A 1 0.0 0.0 0.0", + "HETATM 2 NA HEM A 1 1.0 0.0 0.0", + ], + ) + assert not is_single_atom_ligand(ligand) class TestLigandInfo: From bed3d04d7454156d9d9e76a0f0bb11f6fd586c4a Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 09:43:31 -0500 Subject: [PATCH 08/22] Use try-except imports with helpful error messages for conda deps The ligand libraries (openmmforcefields, openff-toolkit, rdkit) are only available via conda-forge, not PyPI. Instead of lazy imports, use top-level try-except blocks with clear ImportError messages that tell users exactly how to install the missing dependencies: - ligand_utils.py: Check for openff-toolkit and rdkit at import time - relaxer.py: Check for openmmforcefields at import time - Both provide clear conda install commands with version numbers - pyproject.toml: Updated comment with full conda install command - cli.py: Removed optional dependency check (now handled at runtime) Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 14 ++---------- src/graphrelax/cli.py | 14 ++---------- src/graphrelax/ligand_utils.py | 42 ++++++++++++++++++++++++++++------ src/graphrelax/relaxer.py | 31 +++++++++++++++++-------- 4 files changed, 61 insertions(+), 40 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 266e8a0..2ee2d87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,8 @@ dependencies = [ "openmm", ] -# Note: pdbfixer is not on PyPI, install via conda: -# conda install -c conda-forge pdbfixer -# For ligand support in 
unconstrained minimization: -# pip install graphrelax[ligands] -# or: -# conda install -c conda-forge openmmforcefields openff-toolkit rdkit +# Note: Some dependencies are only available via conda-forge: +# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit [project.urls] Homepage = "https://github.com/delalamo/GraphRelax" @@ -67,12 +63,6 @@ test = [ "pytest>=7.0", "pytest-cov>=4.1", ] -ligands = [ - # For ligand parameterization in unconstrained minimization - "openmmforcefields>=0.13.0", - "openff-toolkit>=0.14.0", - "rdkit>=2023.09.1", -] [project.scripts] graphrelax = "graphrelax.cli:main" diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 0a66860..57d25fc 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -221,8 +221,7 @@ def create_parser() -> argparse.ArgumentParser: action="store_true", help=( "Include ligands in unconstrained minimization using " - "openmmforcefields. Requires: pip install openmmforcefields " - "openff-toolkit. If not specified with ligands present, " + "openmmforcefields. If not specified with ligands present, " "--constrained-minimization is required." ), ) @@ -345,20 +344,11 @@ def main(args=None) -> int: ) if has_ligands and uses_relaxation: if opts.include_ligands: - # Verify openmmforcefields is available - try: - import openmmforcefields # noqa: F401 - except ImportError: - logger.error( - "The --include-ligands flag requires openmmforcefields. " - "Install with: pip install openmmforcefields openff-toolkit" - ) - return 1 logger.info("Ligand support enabled via openmmforcefields") elif not opts.constrained_minimization: logger.error( "Input PDB contains ligands (HETATM records). Use one of:\n" - " 1. --include-ligands (requires openmmforcefields)\n" + " 1. --include-ligands\n" " 2. 
--constrained-minimization" ) return 1 diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 0c5d702..a0193fa 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -5,8 +5,37 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +from openmm import app as openmm_app + +# Ligand parameterization dependencies (conda-forge only) +# conda install -c conda-forge openff-toolkit=0.14.0 rdkit=2023.09.1 +try: + from openff.toolkit import Molecule + from rdkit import Chem + from rdkit.Chem import AllChem + + LIGAND_DEPS_AVAILABLE = True +except ImportError: + LIGAND_DEPS_AVAILABLE = False + Molecule = None + Chem = None + AllChem = None + logger = logging.getLogger(__name__) + +def _check_ligand_deps(): + """Raise ImportError if ligand dependencies are not available.""" + if not LIGAND_DEPS_AVAILABLE: + raise ImportError( + "Ligand parameterization requires openff-toolkit and rdkit.\n" + "These packages are only available via conda-forge:\n\n" + " conda install -c conda-forge " + "openff-toolkit=0.14.0 rdkit=2023.09.1\n\n" + "See the README for full installation instructions." + ) + + # Water residue names to exclude from ligand processing WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} @@ -110,14 +139,19 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): 3. From PDB coordinates via RDKit bond perception 4. From direct OpenFF PDB parsing + Requires openff-toolkit and rdkit (install via conda-forge). + Args: ligand: LigandInfo with PDB coordinates smiles: Optional SMILES string (overrides automatic detection) Returns: openff.toolkit.Molecule + + Raises: + ImportError: If openff-toolkit or rdkit are not installed. """ - from openff.toolkit import Molecule + _check_ligand_deps() # 1. 
User-provided SMILES takes precedence if smiles: @@ -168,10 +202,6 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): def _create_molecule_via_rdkit(pdb_block: str): """Create OpenFF Molecule via RDKit from PDB block.""" - from openff.toolkit import Molecule - from rdkit import Chem - from rdkit.Chem import AllChem - # Parse PDB with RDKit mol = Chem.MolFromPDBBlock(pdb_block, removeHs=False, sanitize=False) @@ -203,8 +233,6 @@ def ligand_pdb_to_topology(ligand: LigandInfo): Returns: Tuple of (topology, positions) """ - from openmm import app as openmm_app - pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block)) return pdb_file.topology, pdb_file.positions diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 3373beb..fd68b03 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -10,6 +10,7 @@ from openmm import Platform from openmm import app as openmm_app from openmm import openmm, unit +from pdbfixer import PDBFixer from graphrelax.chain_gaps import ( detect_chain_gaps, @@ -25,13 +26,27 @@ ligand_pdb_to_topology, ) -# Check if openmmforcefields is available for ligand support +# openmmforcefields is only available via conda-forge +# Install with: conda install -c conda-forge openmmforcefields=0.13.0 try: from openmmforcefields.generators import SystemGenerator OPENMMFF_AVAILABLE = True except ImportError: OPENMMFF_AVAILABLE = False + SystemGenerator = None + + +def _check_openmmff(): + """Raise ImportError if openmmforcefields is not available.""" + if not OPENMMFF_AVAILABLE: + raise ImportError( + "Ligand support requires openmmforcefields.\n" + "This package is only available via conda-forge:\n\n" + " conda install -c conda-forge openmmforcefields=0.13.0\n\n" + "See the README for full installation instructions." 
+ ) + # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -187,11 +202,6 @@ def _relax_unconstrained( ) if has_ligands and self.config.include_ligands: - if not OPENMMFF_AVAILABLE: - raise ImportError( - "openmmforcefields is required for ligand support. " - "Install with: pip install openmmforcefields openff-toolkit" - ) return self._relax_unconstrained_with_ligands(pdb_string) ENERGY = unit.kilocalories_per_mole @@ -219,8 +229,6 @@ def _relax_unconstrained( protein_pdb = "\n".join(atom_lines) + "\nEND\n" # Use pdbfixer to add missing atoms and terminal groups - from pdbfixer import PDBFixer - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) fixer.findMissingResidues() fixer.findMissingAtoms() @@ -303,13 +311,18 @@ def _relax_unconstrained_with_ligands( The protein is processed separately with pdbfixer, then combined with parameterized ligands for minimization. + Requires openmmforcefields (install via conda-forge). + Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) + + Raises: + ImportError: If openmmforcefields is not installed. 
""" - from pdbfixer import PDBFixer + _check_openmmff() ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms From 2b3dae358241c2045a7510a48bf49c9c952d160e Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:32:04 -0500 Subject: [PATCH 09/22] Add crystallography artifact removal and mamba installation docs - Add artifacts.py with comprehensive artifact detection for buffers, cryoprotectants, detergents, lipids, reducing agents, and halide ions - Auto-remove artifacts by default, preserve biologically relevant ions - Add --keep-all-ligands and --keep-ligand flags to whitelist residues - Update README with mamba/micromamba installation instructions - Add tests for artifact detection and removal Co-Authored-By: Claude Opus 4.5 --- .pre-commit-config.yaml | 2 +- README.md | 194 ++++++++++++++----- environment.yml | 48 +++++ requirements.txt | 49 +++++ src/graphrelax/artifacts.py | 336 +++++++++++++++++++++++++++++++++ src/graphrelax/cli.py | 54 ++++-- src/graphrelax/config.py | 10 +- src/graphrelax/ligand_utils.py | 22 ++- src/graphrelax/pipeline.py | 15 ++ src/graphrelax/relaxer.py | 7 +- tests/test_artifacts.py | 326 ++++++++++++++++++++++++++++++++ 11 files changed, 991 insertions(+), 72 deletions(-) create mode 100644 environment.yml create mode 100644 requirements.txt create mode 100644 src/graphrelax/artifacts.py create mode 100644 tests/test_artifacts.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 362fbc1..569ecd2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - "--enable=C0415" - "--score=n" name: pylint (no lazy imports) - exclude: "^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/LigandMPNN/|tests/)" + exclude: 
"^(src/graphrelax/__init__.py|src/graphrelax/cli.py|src/graphrelax/designer.py|src/graphrelax/relaxer.py|src/graphrelax/utils.py|src/graphrelax/ligand_utils.py|src/graphrelax/pipeline.py|src/graphrelax/LigandMPNN/|tests/)" - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/README.md b/README.md index 6b0801e..7e8558b 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,62 @@ GraphRelax combines **LigandMPNN** (for sequence design and side-chain packing) ## Installation +### Quick Start (Recommended) + +The easiest way to install GraphRelax with all dependencies is using mamba (much faster than conda): + +```bash +# Install mamba if you don't have it (or use micromamba) +conda install -n base -c conda-forge mamba + +# Create environment from file +mamba env create -f environment.yml +conda activate graphrelax + +# Install GraphRelax +pip install -e . +``` + +Or with **micromamba** (standalone, no conda required): + +```bash +# Install micromamba (see https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html) +# Then: +micromamba create -f environment.yml +micromamba activate graphrelax +pip install -e . +``` + +This installs all dependencies including ligand support. + +### Standard Installation + +For basic usage without ligand support: + ```bash -# Install from PyPI pip install graphrelax ``` LigandMPNN model weights (~40MB) are downloaded automatically on first run. +### Full Installation (with Ligand Support) + +Some dependencies are only available via conda-forge and must be installed before pip: + +```bash +# Step 1: Install conda-forge dependencies (use mamba for speed) +mamba install -c conda-forge \ + pdbfixer \ + openmmforcefields>=0.13.0 \ + openff-toolkit>=0.14.0 \ + rdkit>=2023.09.1 + +# Step 2: Install GraphRelax +pip install graphrelax +``` + +> **Note:** `mamba` is a drop-in replacement for `conda` that's 10-100x faster. 
Install it with `conda install -n base -c conda-forge mamba`, or use `micromamba` as a standalone tool. + ### Development Installation ```bash @@ -20,31 +69,39 @@ LigandMPNN model weights (~40MB) are downloaded automatically on first run. git clone https://github.com/your-username/GraphRelax.git cd GraphRelax -# Install in editable mode +# Option A: Use environment.yml with mamba (recommended) +mamba env create -f environment.yml +conda activate graphrelax pip install -e . -``` -### Optional: Constrained Minimization - -If you want to use `--constrained-minimization` mode (AlphaFold-style relaxation with position restraints and violation checking), you also need pdbfixer: - -```bash -# pdbfixer is only available via conda-forge, not PyPI -conda install -c conda-forge pdbfixer +# Option B: Manual installation +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +pip install -e . ``` -### Platform-specific Installation +### Dependencies -```bash -# CPU-only (smaller install, no GPU dependencies) -pip install "graphrelax[cpu]" +#### Pip packages (installed automatically) -# With CUDA 11 GPU support -pip install "graphrelax[cuda11]" +| Package | Version | Purpose | +| -------------- | ------- | ------------------------ | +| torch | >= 2.0 | Neural network inference | +| numpy | < 2 | Array operations | +| openmm | latest | Molecular dynamics | +| biopython | latest | Structure parsing | +| prody | latest | Protein analysis | +| absl-py | latest | Configuration | +| ml-collections | latest | Configuration | +| dm-tree | latest | Nested data structures | -# With CUDA 12 GPU support -pip install "graphrelax[cuda12]" -``` +#### Conda-forge packages (manual installation required) + +| Package | Version | Purpose | Required for | +| ----------------- | ------------ | --------------------------- | ---------------------------- | +| pdbfixer | latest | Structure preparation | `--constrained-minimization` | +| openmmforcefields | >= 0.13.0 | Small 
molecule force fields | `--include-ligands` | +| openff-toolkit | >= 0.14.0 | OpenFF Molecule creation | `--include-ligands` | +| rdkit | >= 2023.09.1 | Bond perception from PDB | `--include-ligands` | ### Docker @@ -70,24 +127,6 @@ docker build -t graphrelax . docker run --rm graphrelax --help ``` -### Dependencies - -Core dependencies (installed automatically via pip): - -- Python >= 3.9 -- PyTorch >= 2.0 -- NumPy < 2.0 (PyTorch <2.5 is incompatible with NumPy 2.x) -- OpenMM -- BioPython -- ProDy -- dm-tree -- absl-py -- ml-collections - -Optional (for `--constrained-minimization` only): - -- pdbfixer (conda-forge only, not on PyPI) - ## Features - **FastRelax-like protocol**: Alternate between side-chain repacking and energy minimization @@ -165,23 +204,77 @@ graphrelax -i input.pdb -o relaxed.pdb --constrained-minimization --stiffness 5. | Default (unconstrained) | No | No | Fast | No | | `--constrained-minimization` | Yes (harmonic) | Yes | Slower | Yes | -**Important:** When your input PDB contains ligands or other non-standard residues (HETATM records other than water), you **must** use `--constrained-minimization`. The unconstrained mode uses AMBER force field parameters which don't include templates for non-standard molecules. Constrained mode uses OpenFold's AmberRelaxation which can handle ligands properly. +### Crystallography Artifact Removal + +By default, GraphRelax **automatically removes** common crystallography and cryo-EM artifacts from input structures: + +- **Buffers**: SO4, PO4, CIT, ACT, MES, HEPES, Tris, etc. +- **Cryoprotectants**: GOL (glycerol), EDO (ethylene glycol), MPD, PEG variants, DMSO +- **Detergents**: DDM, OG, LDAO, CHAPS, SDS, etc. +- **Lipids/Fatty acids**: PLM (palmitic), MYR (myristic), OLA (oleic), etc. +- **Reducing agents**: BME, DTT +- **Halide ions**: CL, BR, IOD + +**Biologically relevant metal ions** (ZN, MG, CA, FE, MN, CU, etc.) are **preserved**. 
+ +```bash +# Artifacts are removed by default +graphrelax -i crystal_structure.pdb -o relaxed.pdb + +# Keep all HETATM records including artifacts +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-all-ligands + +# Keep specific artifacts that you need +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL --keep-ligand SO4 +``` ### Working with Ligands -When designing proteins with bound ligands (e.g., heme, cofactors, small molecules), use `ligand_mpnn` model type with constrained minimization: +Biologically relevant ligands (cofactors, substrates, inhibitors) are **auto-detected** and parameterized using openmmforcefields. No special flags are needed. ```bash -# Design around a ligand +# Ligands are automatically included and parameterized +graphrelax -i protein_with_ligand.pdb -o designed.pdb --design + +# Choose a specific force field for ligands graphrelax -i protein_with_ligand.pdb -o designed.pdb \ - --design --model-type ligand_mpnn --constrained-minimization + --ligand-forcefield gaff-2.11 -# Repack side chains around a ligand -graphrelax -i protein_with_ligand.pdb -o repacked.pdb \ - --relax --model-type ligand_mpnn --constrained-minimization +# Provide SMILES for unknown ligands +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --ligand-smiles "LIG:c1ccccc1" + +# Strip all ligands (protein-only processing) +graphrelax -i protein_with_ligand.pdb -o designed.pdb --ignore-ligands +``` + +**Requires:** `conda install -c conda-forge openmmforcefields openff-toolkit rdkit` + +For design around ligands, use `ligand_mpnn` model type: + +```bash +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --design --model-type ligand_mpnn +``` + +#### Ligand Force Field Options + +| Force Field | Flag | Description | +| --------------- | ------------------------------------ | -------------------------- | +| OpenFF Sage 2.0 | `--ligand-forcefield openff-2.0.0` | Modern, accurate (default) | +| GAFF 2.11 | `--ligand-forcefield 
gaff-2.11` | Well-tested, robust | +| Espaloma 0.3 | `--ligand-forcefield espaloma-0.3.0` | ML-based, fast | + +#### Alternative: Constrained Minimization + +Use `--constrained-minimization` for position-restrained minimization (does not require openmmforcefields): + +```bash +graphrelax -i protein_with_ligand.pdb -o designed.pdb \ + --design --constrained-minimization ``` -**Note:** If you attempt to use unconstrained minimization with a PDB containing ligands, GraphRelax will exit with an error message directing you to use `--constrained-minimization`. +**Requires:** `conda install -c conda-forge pdbfixer` ### Resfile Format @@ -247,13 +340,22 @@ Relaxation options: --constrained-minimization Use constrained minimization with position restraints (AlphaFold-style). Default is unconstrained. Requires pdbfixer. - **Required when input PDB contains ligands.** --stiffness K Restraint stiffness in kcal/mol/A^2 (default: 10.0) Only applies to constrained minimization. --max-iterations N Max L-BFGS iterations, 0=unlimited (default: 0) + --ignore-ligands Strip all ligands before processing. By default, + ligands are auto-detected and parameterized. + --ligand-forcefield FF Force field for ligands: openff-2.0.0 (default), + gaff-2.11, or espaloma-0.3.0 + --ligand-smiles RES:SMILES Provide SMILES for a ligand residue. Can be + used multiple times for multiple ligands. Input preprocessing: --keep-waters Keep water molecules in input (default: removed) + --keep-all-ligands Keep all HETATM records including crystallography + artifacts. By default, common artifacts are removed. + --keep-ligand RESNAME Keep specific ligand residue (can be used multiple + times). 
Example: --keep-ligand GOL --keep-ligand SO4 Scoring: --scorefile FILE Output scorefile with energy terms diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..1398256 --- /dev/null +++ b/environment.yml @@ -0,0 +1,48 @@ +# GraphRelax Conda Environment +# ============================ +# +# Creates a complete environment with all dependencies for GraphRelax, +# including ligand support. +# +# Usage (mamba is 10-100x faster than conda): +# mamba env create -f environment.yml +# conda activate graphrelax +# pip install -e . +# +# Or with micromamba (standalone, no conda required): +# micromamba create -f environment.yml +# micromamba activate graphrelax +# pip install -e . +# +# For GPU support, also install CUDA-enabled PyTorch: +# pip install torch --index-url https://download.pytorch.org/whl/cu118 + +name: graphrelax +channels: + - conda-forge + - defaults +dependencies: + # Python version + - python>=3.9 + + # Conda-forge only packages (not on PyPI) + - pdbfixer + - openmmforcefields>=0.13.0 + - openff-toolkit>=0.14.0 + - rdkit>=2023.09.1 + + # Core dependencies (also available via pip) + - openmm + - numpy<2 + - biopython + - prody + + # pip dependencies + - pip + - pip: + - torch>=2.0 + - absl-py + - ml-collections + - dm-tree + # Install graphrelax itself + - -e . diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3c80844 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,49 @@ +# GraphRelax Requirements +# ====================== +# +# This file lists ALL dependencies needed to run GraphRelax with full +# functionality, including ligand support. +# +# INSTALLATION ORDER: +# 1. First install conda-forge packages (not available on PyPI) +# 2. Then install pip packages +# +# Quick install (recommended): +# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +# pip install -r requirements.txt +# +# Or use the environment.yml file for a complete conda environment. 
+ +# ============================================================================= +# CONDA-FORGE ONLY (must be installed via conda before pip) +# ============================================================================= +# These packages are NOT on PyPI and must be installed via conda-forge: +# +# conda install -c conda-forge \ +# pdbfixer \ +# openmmforcefields>=0.13.0 \ +# openff-toolkit>=0.14.0 \ +# rdkit>=2023.09.1 +# +# pdbfixer: Required for unconstrained minimization and ligand support +# openmmforcefields: Required for --include-ligands (ligand force fields) +# openff-toolkit: Required for --include-ligands (molecule parameterization) +# rdkit: Required for --include-ligands (bond perception from PDB) + +# ============================================================================= +# PIP PACKAGES (install after conda packages) +# ============================================================================= + +# Core dependencies +torch>=2.0 +numpy<2 # PyTorch <2.5 is incompatible with NumPy 2.x +openmm +biopython +prody +absl-py +ml-collections +dm-tree + +# Development/testing (optional) +# pytest>=7.0 +# pytest-cov>=4.1 diff --git a/src/graphrelax/artifacts.py b/src/graphrelax/artifacts.py new file mode 100644 index 0000000..f97e48c --- /dev/null +++ b/src/graphrelax/artifacts.py @@ -0,0 +1,336 @@ +"""Constants and utilities for crystallography artifact detection and removal. + +Common crystallography and cryo-EM artifacts (buffers, cryoprotectants, +detergents, lipids) are stripped by default during preprocessing. +Use --keep-all-ligands to preserve them. 
+""" + +import logging +from collections import defaultdict +from typing import Dict, Optional, Set, Tuple + +logger = logging.getLogger(__name__) + +# ============================================================================= +# Artifact Categories +# ============================================================================= + +# Buffer components commonly found in crystallization conditions +BUFFER_ARTIFACTS = frozenset( + { + # Sulfate/Phosphate + "SO4", + "PO4", + "PO3", + "PI", + "2HP", + "2PO", + "SPO", + "SPH", + # Organic acids + "CIT", + "FLC", + "ACT", + "ACE", + "FMT", + "FOR", + "NO3", + "AZI", + # HEPES, MES, Tris, etc. + "MES", + "EPE", + "TRS", + "CAC", + "BIS", + "HEZ", + "MOH", + "BTB", + "TAM", + "MRD", + "PGO", + "144", + "IPS", + # Malonate, Tartrate, etc. + "MLI", + "TAR", + "MAL", + "SUC", + "FUM", + # Borate + "BO3", + "BO4", + # Imidazole (from His-tag purification) + "IMD", + "1MZ", + } +) + +# Cryoprotectants used for flash-cooling crystals +CRYOPROTECTANT_ARTIFACTS = frozenset( + { + # Glycerol and glycols + "GOL", + "EDO", + "EGL", + "PGR", + "PGQ", + "GLC", + # MPD (2-Methyl-2,4-pentanediol) + "MPD", + "1PG", + "PDO", + # PEG variants (polyethylene glycol fragments) + "PEG", + "PGE", + "PG4", + "1PE", + "P6G", + "P33", + "PE4", + "PG0", + "2PE", + "PEU", + "PE8", + "PE5", + "XPE", + "12P", + "15P", + "PG5", + "PG6", + "PEE", + "PE3", + "P4G", + "P2E", + # DMSO + "DMS", + # Isopropanol + "IPA", + } +) + +# Detergents used for membrane protein crystallization +DETERGENT_ARTIFACTS = frozenset( + { + # Maltosides (DDM, DM, UDM, etc.) + "LMT", + "MLA", + "BMA", + "TRE", + "DDM", + "DXM", + # Glucosides (OG, NG, etc.) 
+ "BOG", + "BGC", + "NDG", + "HTG", + "OGA", + "NGS", + # LDAO (Lauryldimethylamine-N-oxide) + "LDA", + "DAO", + # CHAPS/CHAPSO + "CPS", + "CHT", + "SDS", + "CHP", + # Triton, digitonin + "TRT", + "T3A", + "D10", + "D12", + "DGT", + # LMNG, cymals + "MNG", + "CYC", + "LMG", + # C12E8/9 polyoxyethylene + "CE9", + "C8E", + "C10", + "C12", + # Octyl glucoside + "OLC", + } +) + +# Lipids and fatty acids (from LCP crystallization or membrane proteins) +LIPID_ARTIFACTS = frozenset( + { + # Common fatty acids + "PLM", + "MYR", + "OLA", + "STE", + "PAL", + "LNL", + "ARA", + "DCA", + "UND", + "MYS", + "MYA", + "LNO", + "EOA", + "PEF", + # Monoolein (lipidic cubic phase crystallization) + "OLB", + "9OL", + "MPG", + "OLI", + # Phospholipids and fragments + "PC", + "PE", + "PG", + "PS", + "PLC", + "EPH", + "CDL", + # Cholesterol + "CLR", + "CHO", + } +) + +# Reducing agents from protein preparation +REDUCING_AGENT_ARTIFACTS = frozenset( + { + "BME", + "DTT", + "DTU", + "TCE", + "TRO", + "GSH", + } +) + +# Halide ions (often crystallization additives, not biologically relevant) +HALIDE_ARTIFACTS = frozenset( + { + "CL", + "BR", + "IOD", + "F", + } +) + +# Unknown/placeholder atoms +UNKNOWN_ARTIFACTS = frozenset( + { + "UNX", + "UNL", + "UNK", + "DUM", + } +) + +# ============================================================================= +# Combined Sets +# ============================================================================= + +# Master set of all artifacts to remove by default +CRYSTALLOGRAPHY_ARTIFACTS = ( + BUFFER_ARTIFACTS + | CRYOPROTECTANT_ARTIFACTS + | DETERGENT_ARTIFACTS + | LIPID_ARTIFACTS + | REDUCING_AGENT_ARTIFACTS + | HALIDE_ARTIFACTS + | UNKNOWN_ARTIFACTS +) + +# Biologically relevant ions - NOT stripped by default +# These are often structurally/functionally important +BIOLOGICALLY_RELEVANT_IONS = frozenset( + { + "ZN", + "MG", + "CA", + "FE", + "FE2", + "MN", + "CU", + "CO", + "NI", + "MO", + "NA", + "K", # Often biologically relevant in 
channels/pumps + } +) + +# Water residues (handled separately by remove_waters) +WATER_RESIDUES = frozenset({"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"}) + + +# ============================================================================= +# Removal Functions +# ============================================================================= + + +def remove_artifacts( + pdb_string: str, + keep_residues: Optional[Set[str]] = None, +) -> Tuple[str, Dict[str, int]]: + """ + Remove crystallography artifacts from PDB string. + + Artifacts are identified by their residue name (columns 17-20 in PDB + format). HETATM records with residue names in CRYSTALLOGRAPHY_ARTIFACTS + are removed unless they appear in keep_residues. + + Args: + pdb_string: PDB file contents as string + keep_residues: Set of residue names to preserve (whitelist) + + Returns: + Tuple of: + - filtered_pdb: PDB string with artifacts removed + - removed_counts: Dict mapping residue name to atom count + """ + if keep_residues is None: + keep_residues = set() + + # Normalize whitelist to uppercase + keep_residues = {r.upper() for r in keep_residues} + + # Residues to remove = artifacts minus whitelist + to_remove = CRYSTALLOGRAPHY_ARTIFACTS - keep_residues + + kept_lines = [] + removed_counts = defaultdict(int) + + for line in pdb_string.split("\n"): + if line.startswith("HETATM"): + resname = line[17:20].strip().upper() + if resname in to_remove: + removed_counts[resname] += 1 + continue + kept_lines.append(line) + + filtered_pdb = "\n".join(kept_lines) + + return filtered_pdb, dict(removed_counts) + + +def is_artifact(resname: str) -> bool: + """ + Check if a residue name is a known crystallography artifact. + + Args: + resname: Three-letter residue code + + Returns: + True if the residue is in CRYSTALLOGRAPHY_ARTIFACTS + """ + return resname.upper() in CRYSTALLOGRAPHY_ARTIFACTS + + +def is_biologically_relevant_ion(resname: str) -> bool: + """ + Check if a residue name is a biologically relevant ion. 
+ + Args: + resname: Three-letter residue code + + Returns: + True if the residue is in BIOLOGICALLY_RELEVANT_IONS + """ + return resname.upper() in BIOLOGICALLY_RELEVANT_IONS diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 57d25fc..4653f5b 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -217,12 +217,12 @@ def create_parser() -> argparse.ArgumentParser: ), ) relax_group.add_argument( - "--include-ligands", + "--ignore-ligands", action="store_true", help=( - "Include ligands in unconstrained minimization using " - "openmmforcefields. If not specified with ligands present, " - "--constrained-minimization is required." + "Strip all ligands (HETATM records) before processing. " + "By default, ligands are auto-detected and parameterized " + "using openmmforcefields." ), ) relax_group.add_argument( @@ -262,6 +262,26 @@ def create_parser() -> argparse.ArgumentParser: action="store_true", help="Keep water molecules in input (default: waters are removed)", ) + preprocess_group.add_argument( + "--keep-all-ligands", + action="store_true", + help=( + "Keep all HETATM records including crystallography artifacts " + "(buffers, cryoprotectants, detergents, lipids). " + "By default, common artifacts are removed." + ), + ) + preprocess_group.add_argument( + "--keep-ligand", + type=str, + action="append", + metavar="RESNAME", + help=( + "Keep a specific ligand residue that would otherwise be stripped " + "as an artifact. Can be used multiple times. 
" + "Example: --keep-ligand GOL --keep-ligand SO4" + ), + ) # General options general_group = parser.add_argument_group("General options") @@ -336,22 +356,19 @@ def main(args=None) -> int: input_format = detect_format(opts.input) has_ligands = _check_for_ligands(opts.input, input_format) - # Validate: ligand handling requires --include-ligands or --constrained + # Log ligand handling info uses_relaxation = mode in ( PipelineMode.RELAX, PipelineMode.NO_REPACK, PipelineMode.DESIGN, ) if has_ligands and uses_relaxation: - if opts.include_ligands: - logger.info("Ligand support enabled via openmmforcefields") - elif not opts.constrained_minimization: - logger.error( - "Input PDB contains ligands (HETATM records). Use one of:\n" - " 1. --include-ligands\n" - " 2. --constrained-minimization" - ) - return 1 + if opts.ignore_ligands: + logger.info("Ligands will be stripped (--ignore-ligands)") + elif opts.constrained_minimization: + logger.info("Ligands handled via constrained minimization") + else: + logger.info("Ligands auto-detected, using openmmforcefields") logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") @@ -383,11 +400,16 @@ def main(args=None) -> int: max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, - include_ligands=opts.include_ligands, + ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, ) + # Build keep_residues set from --keep-ligand flags + keep_residues = set() + if opts.keep_ligand: + keep_residues = {r.upper() for r in opts.keep_ligand} + pipeline_config = PipelineConfig( mode=mode, n_iterations=opts.n_iter, @@ -395,6 +417,8 @@ def main(args=None) -> int: scorefile=opts.scorefile, verbose=opts.verbose, remove_waters=not opts.keep_waters, + remove_artifacts=not opts.keep_all_ligands, + keep_residues=keep_residues, design=design_config, relax=relax_config, ) diff --git 
a/src/graphrelax/config.py b/src/graphrelax/config.py index c00f37a..909f43b 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, Literal, Optional +from typing import Dict, Literal, Optional, Set LigandForceField = Literal["openff-2.0.0", "gaff-2.11", "espaloma-0.3.0"] @@ -46,10 +46,8 @@ class RelaxConfig: split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure # GPU is auto-detected and used when available - # Ligand support options - include_ligands: bool = ( - False # Enable ligand parameterization in unconstrained - ) + # Ligand support options (ligands are auto-detected) + ignore_ligands: bool = False # If True, strip all ligands before processing ligand_forcefield: LigandForceField = ( "openff-2.0.0" # Force field for ligands ) @@ -68,5 +66,7 @@ class PipelineConfig: scorefile: Optional[Path] = None # If set, write scores to this file verbose: bool = False remove_waters: bool = True # Remove water molecules from input + remove_artifacts: bool = True # Remove crystallography artifacts by default + keep_residues: Set[str] = field(default_factory=set) # Whitelist residues design: DesignConfig = field(default_factory=DesignConfig) relax: RelaxConfig = field(default_factory=RelaxConfig) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index a0193fa..6c13ef8 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -51,22 +51,36 @@ class LigandInfo: smiles: Optional[str] = None -def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: +def extract_ligands_from_pdb( + pdb_string: str, + exclude_artifacts: bool = True, +) -> Tuple[str, List[LigandInfo]]: """ Separate protein ATOM records from ligand HETATM records. 
+ By default, crystallography artifacts (buffers, cryoprotectants, detergents, + lipids) are excluded from the ligand list since they cannot be meaningfully + parameterized for minimization. + Args: pdb_string: Full PDB string with protein and ligands + exclude_artifacts: If True, skip known crystallography artifacts Returns: Tuple of (protein_only_pdb, list_of_ligand_info) """ + # Import here to avoid circular imports + if exclude_artifacts: + from graphrelax.artifacts import CRYSTALLOGRAPHY_ARTIFACTS + else: + CRYSTALLOGRAPHY_ARTIFACTS = set() + protein_lines = [] ligand_lines_by_residue = {} for line in pdb_string.split("\n"): if line.startswith("HETATM"): - resname = line[17:20].strip() + resname = line[17:20].strip().upper() chain_id = line[21] if len(line) > 21 else " " try: resnum = int(line[22:26].strip()) @@ -77,6 +91,10 @@ def extract_ligands_from_pdb(pdb_string: str) -> Tuple[str, List[LigandInfo]]: if resname in WATER_RESIDUES: continue + # Skip known artifacts (they won't be parameterized) + if resname in CRYSTALLOGRAPHY_ARTIFACTS: + continue + key = (chain_id, resnum, resname) if key not in ligand_lines_by_residue: ligand_lines_by_residue[key] = [] diff --git a/src/graphrelax/pipeline.py b/src/graphrelax/pipeline.py index 7e50cc1..e03ea28 100644 --- a/src/graphrelax/pipeline.py +++ b/src/graphrelax/pipeline.py @@ -182,6 +182,21 @@ def _run_single_output( if removed > 0: logger.info(f"Removed {removed} water-related lines from input") + # Remove crystallography artifacts if requested + if self.config.remove_artifacts: + from graphrelax.artifacts import remove_artifacts + + current_structure, removed_counts = remove_artifacts( + current_structure, + keep_residues=self.config.keep_residues, + ) + if removed_counts: + total = sum(removed_counts.values()) + artifacts = ", ".join( + f"{k}({v})" for k, v in sorted(removed_counts.items()) + ) + logger.info(f"Removed {total} artifact atoms: {artifacts}") + # Convert to PDB format for internal processing if 
needed current_pdb = ensure_pdb_format(current_structure, input_pdb) diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index fd68b03..41f46e9 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -185,8 +185,8 @@ def _relax_unconstrained( No position restraints, no violation checking, uses OpenMM defaults. This is the default minimization mode. - If config.include_ligands is True and ligands are present, uses - openmmforcefields for ligand parameterization. + Ligands are auto-detected. If present and ignore_ligands is False, + uses openmmforcefields for ligand parameterization. Args: pdb_string: PDB file contents as string @@ -201,7 +201,8 @@ def _relax_unconstrained( for line in pdb_string.split("\n") ) - if has_ligands and self.config.include_ligands: + # Auto-detect ligands and use openmmforcefields unless ignore_ligands + if has_ligands and not self.config.ignore_ligands: return self._relax_unconstrained_with_ligands(pdb_string) ENERGY = unit.kilocalories_per_mole diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py new file mode 100644 index 0000000..ded13bb --- /dev/null +++ b/tests/test_artifacts.py @@ -0,0 +1,326 @@ +"""Tests for crystallography artifact detection and removal.""" + +from graphrelax.artifacts import ( + BIOLOGICALLY_RELEVANT_IONS, + BUFFER_ARTIFACTS, + CRYOPROTECTANT_ARTIFACTS, + CRYSTALLOGRAPHY_ARTIFACTS, + DETERGENT_ARTIFACTS, + HALIDE_ARTIFACTS, + LIPID_ARTIFACTS, + REDUCING_AGENT_ARTIFACTS, + WATER_RESIDUES, + is_artifact, + is_biologically_relevant_ion, + remove_artifacts, +) + + +class TestArtifactConstants: + """Tests for artifact constant definitions.""" + + def test_common_buffers_included(self): + """Test that common buffer artifacts are in the set.""" + buffers = ["SO4", "PO4", "CIT", "ACT", "MES", "EPE", "TRS"] + for buf in buffers: + assert ( + buf in BUFFER_ARTIFACTS + ), f"{buf} should be in BUFFER_ARTIFACTS" + + def test_common_cryoprotectants_included(self): + """Test that 
common cryoprotectants are in the set.""" + cryo = ["GOL", "EDO", "MPD", "PEG", "DMS"] + for c in cryo: + assert ( + c in CRYOPROTECTANT_ARTIFACTS + ), f"{c} should be in CRYOPROTECTANT_ARTIFACTS" + + def test_common_detergents_included(self): + """Test that common detergents are in the set.""" + detergents = ["BOG", "LDA", "SDS"] + for d in detergents: + assert ( + d in DETERGENT_ARTIFACTS + ), f"{d} should be in DETERGENT_ARTIFACTS" + + def test_common_lipids_included(self): + """Test that common lipids/fatty acids are in the set.""" + lipids = ["PLM", "MYR", "OLA", "STE"] + for lip in lipids: + assert lip in LIPID_ARTIFACTS, f"{lip} should be in LIPID_ARTIFACTS" + + def test_reducing_agents_included(self): + """Test that common reducing agents are in the set.""" + agents = ["BME", "DTT"] + for a in agents: + assert ( + a in REDUCING_AGENT_ARTIFACTS + ), f"{a} should be in REDUCING_AGENT_ARTIFACTS" + + def test_halides_included(self): + """Test that halide ions are in the set.""" + halides = ["CL", "BR", "IOD", "F"] + for h in halides: + assert h in HALIDE_ARTIFACTS, f"{h} should be in HALIDE_ARTIFACTS" + + def test_metal_ions_not_in_artifacts(self): + """Test that biologically relevant metal ions are NOT artifacts.""" + metals = ["ZN", "MG", "CA", "FE", "MN", "CU"] + for m in metals: + assert ( + m not in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{m} should NOT be in CRYSTALLOGRAPHY_ARTIFACTS" + + def test_biologically_relevant_ions_defined(self): + """Test that biologically relevant ions are in their own set.""" + ions = ["ZN", "MG", "CA", "FE", "MN", "CU", "CO", "NI"] + for ion in ions: + assert ( + ion in BIOLOGICALLY_RELEVANT_IONS + ), f"{ion} should be in BIOLOGICALLY_RELEVANT_IONS" + + def test_master_set_contains_all_categories(self): + """Test that CRYSTALLOGRAPHY_ARTIFACTS combines all category sets.""" + all_categories = ( + BUFFER_ARTIFACTS + | CRYOPROTECTANT_ARTIFACTS + | DETERGENT_ARTIFACTS + | LIPID_ARTIFACTS + | REDUCING_AGENT_ARTIFACTS + | 
HALIDE_ARTIFACTS + ) + for artifact in all_categories: + assert ( + artifact in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{artifact} should be in CRYSTALLOGRAPHY_ARTIFACTS" + + def test_water_residues_separate(self): + """Test that water residues are in their own set.""" + waters = ["HOH", "WAT", "SOL"] + for w in waters: + assert w in WATER_RESIDUES, f"{w} should be in WATER_RESIDUES" + # Waters should NOT be in crystallography artifacts + assert ( + w not in CRYSTALLOGRAPHY_ARTIFACTS + ), f"{w} should NOT be in CRYSTALLOGRAPHY_ARTIFACTS" + + +class TestIsArtifact: + """Tests for is_artifact function.""" + + def test_glycerol_is_artifact(self): + """Test that glycerol is detected as artifact.""" + assert is_artifact("GOL") + assert is_artifact("gol") # case insensitive + + def test_sulfate_is_artifact(self): + """Test that sulfate is detected as artifact.""" + assert is_artifact("SO4") + + def test_zinc_is_not_artifact(self): + """Test that zinc is not detected as artifact.""" + assert not is_artifact("ZN") + + def test_heme_is_not_artifact(self): + """Test that heme is not detected as artifact.""" + assert not is_artifact("HEM") + + +class TestIsBiologicallyRelevantIon: + """Tests for is_biologically_relevant_ion function.""" + + def test_zinc_is_relevant(self): + """Test that zinc is detected as relevant.""" + assert is_biologically_relevant_ion("ZN") + assert is_biologically_relevant_ion("zn") # case insensitive + + def test_magnesium_is_relevant(self): + """Test that magnesium is detected as relevant.""" + assert is_biologically_relevant_ion("MG") + + def test_glycerol_is_not_relevant_ion(self): + """Test that glycerol is not a relevant ion.""" + assert not is_biologically_relevant_ion("GOL") + + +class TestRemoveArtifacts: + """Tests for remove_artifacts function.""" + + def test_removes_glycerol(self): + """Test that glycerol atoms are removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + 
"HETATM 3 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 4 O2 GOL A 100 6.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "GOL" not in result + assert "GOL" in removed + assert removed["GOL"] == 2 + assert "ATOM" in result # Protein preserved + + def test_removes_sulfate(self): + """Test that sulfate atoms are removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 S SO4 A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 O1 SO4 A 200 6.0 5.0 5.0 1.00 0.00\n" + "HETATM 4 O2 SO4 A 200 7.0 5.0 5.0 1.00 0.00\n" + "HETATM 5 O3 SO4 A 200 8.0 5.0 5.0 1.00 0.00\n" + "HETATM 6 O4 SO4 A 200 9.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "SO4" not in result + assert removed["SO4"] == 5 + + def test_keeps_zinc(self): + """Test that zinc ions are preserved.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 ZN ZN A 200 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "ZN" in result + assert "ZN" not in removed + + def test_keeps_heme(self): + """Test that heme is preserved (not an artifact).""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 FE HEM A 200 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 NA HEM A 200 6.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "HEM" in result + assert "HEM" not in removed + + def test_whitelist_preserves_artifact(self): + """Test that whitelisted artifacts are preserved.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb, keep_residues={"GOL"}) + + assert "GOL" in result + assert "GOL" not in removed + + def test_whitelist_case_insensitive(self): + """Test that whitelist is case insensitive.""" + # fmt: off + 
pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb, keep_residues={"gol"}) + + assert "GOL" in result + + def test_multiple_artifact_types(self): + """Test removal of multiple artifact types.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 O1 GOL A 100 5.0 5.0 5.0 1.00 0.00\n" + "HETATM 3 S SO4 A 200 6.0 6.0 6.0 1.00 0.00\n" + "HETATM 4 CL CL A 300 7.0 7.0 7.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "GOL" not in result + assert "SO4" not in result + assert "CL" not in result + assert removed["GOL"] == 1 + assert removed["SO4"] == 1 + assert removed["CL"] == 1 + + def test_empty_pdb(self): + """Test handling of empty PDB.""" + pdb = "" + result, removed = remove_artifacts(pdb) + assert result == "" + assert len(removed) == 0 + + def test_protein_only_pdb(self): + """Test handling of PDB with no artifacts.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "ATOM 2 CA ALA A 1 1.5 0.0 0.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert result == pdb + assert len(removed) == 0 + + +class TestRemoveArtifactsDetergentsAndLipids: + """Tests specifically for detergent and lipid removal.""" + + def test_removes_palmitic_acid(self): + """Test that palmitic acid is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 C1 PLM A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "PLM" not in result + assert removed["PLM"] == 1 + + def test_removes_octyl_glucoside(self): + """Test that octyl glucoside detergent is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 C1 BOG A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert 
"BOG" not in result + assert removed["BOG"] == 1 + + def test_removes_ldao(self): + """Test that LDAO detergent is removed.""" + # fmt: off + pdb = ( + "ATOM 1 N ALA A 1 0.0 0.0 0.0 1.00 0.00\n" + "HETATM 2 N LDA A 100 5.0 5.0 5.0 1.00 0.00\n" + "END\n" + ) + # fmt: on + result, removed = remove_artifacts(pdb) + + assert "LDA" not in result + assert removed["LDA"] == 1 From 160ddf7204b47f7308e6d1a62a402ff22c2329a1 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:42:21 -0500 Subject: [PATCH 10/22] Simplify --keep-ligand to accept comma-separated values only - Remove action="append" in favor of single comma-separated string - Fix parsing logic to handle single string instead of list - Add unit tests for --keep-ligand CLI argument parsing Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/cli.py | 15 ++++---- tests/test_artifacts.py | 78 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 67ccb45..2dd0469 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -274,12 +274,10 @@ def create_parser() -> argparse.ArgumentParser: preprocess_group.add_argument( "--keep-ligand", type=str, - action="append", - metavar="RESNAME", + metavar="RESNAMES", help=( - "Keep a specific ligand residue that would otherwise be stripped " - "as an artifact. Can be used multiple times. " - "Example: --keep-ligand GOL --keep-ligand SO4" + "Keep specific ligand residue(s) that would otherwise be stripped " + "as artifacts. Comma-separated. 
Example: --keep-ligand GOL,SO4" ), ) preprocess_group.add_argument( @@ -433,10 +431,13 @@ def main(args=None) -> int: ligand_smiles=ligand_smiles, ) - # Build keep_residues set from --keep-ligand flags + # Build keep_residues set from --keep-ligand flag (comma-separated) keep_residues = set() if opts.keep_ligand: - keep_residues = {r.upper() for r in opts.keep_ligand} + for resname in opts.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) idealize_config = IdealizeConfig( enabled=opts.pre_idealize, diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py index ded13bb..3d8184a 100644 --- a/tests/test_artifacts.py +++ b/tests/test_artifacts.py @@ -324,3 +324,81 @@ def test_removes_ldao(self): assert "LDA" not in result assert removed["LDA"] == 1 + + +class TestCLIKeepLigandParsing: + """Tests for CLI --keep-ligand argument parsing.""" + + def test_single_ligand(self): + """Test parsing a single ligand name.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL"] + ) + assert args.keep_ligand == "GOL" + + def test_comma_separated_ligands(self): + """Test parsing comma-separated ligand names.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL,SO4,EDO"] + ) + assert args.keep_ligand == "GOL,SO4,EDO" + + # Test the parsing logic that builds the keep_residues set + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4", "EDO"} + + def test_comma_separated_with_spaces(self): + """Test parsing comma-separated ligand names with spaces.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", 
"in.pdb", "-o", "out.pdb", "--keep-ligand", "GOL, SO4, EDO"] + ) + + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4", "EDO"} + + def test_lowercase_normalized_to_uppercase(self): + """Test that lowercase ligand names are normalized to uppercase.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args( + ["-i", "in.pdb", "-o", "out.pdb", "--keep-ligand", "gol,so4"] + ) + + keep_residues = set() + if args.keep_ligand: + for resname in args.keep_ligand.split(","): + resname = resname.strip().upper() + if resname: + keep_residues.add(resname) + + assert keep_residues == {"GOL", "SO4"} + + def test_no_keep_ligand(self): + """Test that keep_ligand is None when not provided.""" + from graphrelax.cli import create_parser + + parser = create_parser() + args = parser.parse_args(["-i", "in.pdb", "-o", "out.pdb"]) + assert args.keep_ligand is None From 4e7b39c1b204c1e95b929902cb4240e68d043161 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 10:51:45 -0500 Subject: [PATCH 11/22] Remove redundant code and update README Code cleanup: - Consolidate WATER_RESIDUES to artifacts.py (was defined in 3 places) - Create shared check_gpu_available() in utils.py (was duplicated) - Remove unused add_ter_records_at_gaps() from chain_gaps.py - Remove unused _relax_direct() method from relaxer.py (~100 lines) - Remove corresponding test class TestAddTerRecordsAtGaps README update: - Update --keep-ligand documentation to show comma-separated syntax Total: ~180 lines of redundant code removed. 
Co-Authored-By: Claude Opus 4.5 --- README.md | 9 ++- src/graphrelax/chain_gaps.py | 57 -------------- src/graphrelax/idealize.py | 11 +-- src/graphrelax/ligand_utils.py | 7 +- src/graphrelax/relaxer.py | 134 ++------------------------------- src/graphrelax/utils.py | 39 ++++++++-- tests/test_chain_gaps.py | 22 ------ 7 files changed, 51 insertions(+), 228 deletions(-) diff --git a/README.md b/README.md index 83c1d43..4d4e325 100644 --- a/README.md +++ b/README.md @@ -193,8 +193,8 @@ graphrelax -i crystal_structure.pdb -o relaxed.pdb # Keep all HETATM records including artifacts graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-all-ligands -# Keep specific artifacts that you need -graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL --keep-ligand SO4 +# Keep specific artifacts that you need (comma-separated) +graphrelax -i crystal_structure.pdb -o relaxed.pdb --keep-ligand GOL,SO4 ``` ### Working with Ligands @@ -349,8 +349,9 @@ Input preprocessing: --keep-waters Keep water molecules in input (default: removed) --keep-all-ligands Keep all HETATM records including crystallography artifacts. By default, common artifacts are removed. - --keep-ligand RESNAME Keep specific ligand residue (can be used multiple - times). Example: --keep-ligand GOL --keep-ligand SO4 + --keep-ligand RES1,RES2,... + Keep specific ligand residues (comma-separated). + Example: --keep-ligand GOL,SO4 --pre-idealize Idealize backbone geometry before processing. Corrects bond lengths/angles while preserving dihedral angles. By default, chain breaks are closed. diff --git a/src/graphrelax/chain_gaps.py b/src/graphrelax/chain_gaps.py index e7404b7..a4987ee 100644 --- a/src/graphrelax/chain_gaps.py +++ b/src/graphrelax/chain_gaps.py @@ -278,63 +278,6 @@ def restore_chain_ids( return "\n".join(output_lines) -def add_ter_records_at_gaps(pdb_string: str, gaps: List[ChainGap]) -> str: - """ - Add TER records at gap locations to signal chain breaks. 
- - This is an alternative to chain splitting that works with force fields - that recognize TER as chain termination. - - Args: - pdb_string: PDB file contents as string - gaps: List of detected gaps - - Returns: - PDB string with TER records inserted at gap locations - """ - if not gaps: - return pdb_string - - # Build set of (chain_id, resnum, icode) where TER should be inserted - # TER goes after residue_before - ter_locations = { - (g.chain_id, g.residue_before, g.icode_before) for g in gaps - } - - output_lines = [] - prev_chain = None - prev_resnum = None - prev_icode = None - - for line in pdb_string.split("\n"): - if line.startswith(("ATOM", "HETATM")) and len(line) > 26: - chain_id = line[21] - resnum = int(line[22:26].strip()) - icode = line[26].strip() if len(line) > 26 else "" - - # Check if we need to insert TER before this line - # (i.e., previous residue was at a gap location) - if ( - prev_chain is not None - and (prev_chain, prev_resnum, prev_icode) in ter_locations - and (chain_id != prev_chain or resnum != prev_resnum) - ): - # Insert TER record - ter_line = "TER " - output_lines.append(ter_line) - logger.debug( - f"Inserted TER after {prev_chain}:{prev_resnum}{prev_icode}" - ) - - prev_chain = chain_id - prev_resnum = resnum - prev_icode = icode - - output_lines.append(line) - - return "\n".join(output_lines) - - def get_gap_summary(gaps: List[ChainGap]) -> str: """ Generate a human-readable summary of detected gaps. 
diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 49e1236..5c814e9 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -20,6 +20,7 @@ from openmm import openmm, unit from pdbfixer import PDBFixer +from graphrelax.artifacts import WATER_RESIDUES from graphrelax.chain_gaps import ( ChainGap, detect_chain_gaps, @@ -27,6 +28,7 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig +from graphrelax.utils import check_gpu_available # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" @@ -37,9 +39,6 @@ logger = logging.getLogger(__name__) -# Water residue names to preserve with protein -WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - @dataclass class DihedralAngles: @@ -482,11 +481,7 @@ def minimize_with_constraints( ) # Check for GPU - use_gpu = False - for i in range(Platform.getNumPlatforms()): - if Platform.getPlatform(i).getName() == "CUDA": - use_gpu = True - break + use_gpu = check_gpu_available() # Create simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 6c13ef8..c774e07 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -23,6 +23,9 @@ logger = logging.getLogger(__name__) +# Import WATER_RESIDUES from artifacts to avoid duplication +from graphrelax.artifacts import WATER_RESIDUES # noqa: E402 + def _check_ligand_deps(): """Raise ImportError if ligand dependencies are not available.""" @@ -36,10 +39,6 @@ def _check_ligand_deps(): ) -# Water residue names to exclude from ligand processing -WATER_RESIDUES = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - - @dataclass class LigandInfo: """Information about a ligand extracted from PDB.""" diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 61e613b..faa1d44 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -4,14 
+4,14 @@ import logging import sys from pathlib import Path -from typing import Optional, Tuple +from typing import Tuple import numpy as np -from openmm import Platform from openmm import app as openmm_app from openmm import openmm, unit from pdbfixer import PDBFixer +from graphrelax.artifacts import WATER_RESIDUES from graphrelax.chain_gaps import ( detect_chain_gaps, get_gap_summary, @@ -21,11 +21,11 @@ from graphrelax.config import RelaxConfig from graphrelax.idealize import extract_ligands, restore_ligands from graphrelax.ligand_utils import ( - WATER_RESIDUES, create_openff_molecule, extract_ligands_from_pdb, ligand_pdb_to_topology, ) +from graphrelax.utils import check_gpu_available # openmmforcefields is only available via conda-forge # Install with: conda install -c conda-forge openmmforcefields=0.13.0 @@ -66,22 +66,6 @@ class Relaxer: def __init__(self, config: RelaxConfig): self.config = config - self._use_gpu: Optional[bool] = None - - def _check_gpu_available(self) -> bool: - """Check if CUDA is available for OpenMM.""" - if self._use_gpu is not None: - return self._use_gpu - - for i in range(Platform.getNumPlatforms()): - if Platform.getPlatform(i).getName() == "CUDA": - self._use_gpu = True - logger.info("OpenMM CUDA platform detected, using GPU") - return True - - self._use_gpu = False - logger.info("OpenMM CUDA not available, using CPU") - return False def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ @@ -165,7 +149,7 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() relaxer = AmberRelaxation( max_iterations=self.config.max_iterations, @@ -223,7 +207,7 @@ def _relax_unconstrained( ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() logger.info( f"Running unconstrained OpenMM minimization " @@ 
-329,7 +313,7 @@ def _relax_unconstrained_with_ligands( ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() logger.info( f"Running unconstrained minimization with ligands " @@ -450,110 +434,6 @@ def _relax_unconstrained_with_ligands( violations = np.zeros(0) return relaxed_pdb, debug_data, violations - def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: - """ - Direct OpenMM minimization without pdbfixer. - - This is a simpler approach that works for already-complete structures - (like those from LigandMPNN with packed side chains). - - Args: - pdb_string: PDB file contents as string - - Returns: - Tuple of (relaxed_pdb_string, debug_info, violations) - """ - ENERGY = unit.kilocalories_per_mole - LENGTH = unit.angstroms - - use_gpu = self._check_gpu_available() - - logger.info( - f"Running direct OpenMM minimization " - f"(max_iter={self.config.max_iterations}, " - f"stiffness={self.config.stiffness}, gpu={use_gpu})" - ) - - # Parse PDB - pdb_file = io.StringIO(pdb_string) - pdb = openmm_app.PDBFile(pdb_file) - - # Create force field and system - force_field = openmm_app.ForceField( - "amber14-all.xml", "amber14/tip3pfb.xml" - ) - - # Use Modeller to add hydrogens (doesn't require pdbfixer) - modeller = openmm_app.Modeller(pdb.topology, pdb.positions) - modeller.addHydrogens(force_field) - - # Create system with constraints on hydrogen bonds - system = force_field.createSystem( - modeller.topology, constraints=openmm_app.HBonds - ) - - # Add position restraints if stiffness > 0 - if self.config.stiffness > 0: - self._add_restraints( - system, modeller, self.config.stiffness * ENERGY / (LENGTH**2) - ) - - # Create integrator and simulation - integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) - platform = openmm.Platform.getPlatformByName( - "CUDA" if use_gpu else "CPU" - ) - simulation = openmm_app.Simulation( - modeller.topology, system, integrator, platform - 
) - simulation.context.setPositions(modeller.positions) - - # Get initial energy - state = simulation.context.getState(getEnergy=True, getPositions=True) - einit = state.getPotentialEnergy().value_in_unit(ENERGY) - posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - - # Minimize - # OpenMM minimizeEnergy tolerance is in kJ/mol/nm (gradient threshold) - tolerance = ( - self.config.tolerance * unit.kilojoules_per_mole / unit.nanometer - ) - simulation.minimizeEnergy( - maxIterations=self.config.max_iterations, tolerance=tolerance - ) - - # Get final state - state = simulation.context.getState(getEnergy=True, getPositions=True) - efinal = state.getPotentialEnergy().value_in_unit(ENERGY) - pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - - # Calculate RMSD - rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - - # Write output PDB - output = io.StringIO() - openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output - ) - relaxed_pdb = output.getvalue() - - debug_data = { - "initial_energy": einit, - "final_energy": efinal, - "rmsd": rmsd, - "attempts": 1, - } - - logger.info( - f"Relaxation complete: E_init={einit:.2f}, " - f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" - ) - - # No violations tracking in direct mode - violations = np.zeros(0) - - return relaxed_pdb, debug_data, violations - def _add_restraints(self, system, modeller, stiffness): """Add harmonic position restraints to heavy atoms.""" force = openmm.CustomExternalForce( @@ -603,7 +483,7 @@ def get_energy_breakdown(self, pdb_string: str) -> dict: ) # Create simulation - use_gpu = self._check_gpu_available() + use_gpu = check_gpu_available() platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" ) diff --git a/src/graphrelax/utils.py b/src/graphrelax/utils.py index 9e2ae91..0eec674 100644 --- a/src/graphrelax/utils.py +++ b/src/graphrelax/utils.py @@ -5,10 +5,40 @@ from pathlib import Path from typing import Optional +from 
graphrelax.artifacts import WATER_RESIDUES from graphrelax.structure_io import StructureFormat logger = logging.getLogger(__name__) +# Cache for GPU availability check +_gpu_available = None + + +def check_gpu_available() -> bool: + """ + Check if CUDA is available for OpenMM. + + Results are cached after first check. + + Returns: + True if CUDA platform is available + """ + global _gpu_available + if _gpu_available is not None: + return _gpu_available + + from openmm import Platform + + for i in range(Platform.getNumPlatforms()): + if Platform.getPlatform(i).getName() == "CUDA": + _gpu_available = True + logger.info("OpenMM CUDA platform detected, using GPU") + return True + + _gpu_available = False + logger.info("OpenMM CUDA not available, using CPU") + return False + def remove_waters(structure_string: str, fmt: StructureFormat = None) -> str: """ @@ -39,7 +69,6 @@ def _remove_waters_pdb(pdb_string: str) -> str: Returns: PDB string with water molecules removed """ - water_residues = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} filtered_lines = [] for line in pdb_string.splitlines(): @@ -48,13 +77,13 @@ def _remove_waters_pdb(pdb_string: str) -> str: # Residue name is in columns 17-20 (0-indexed: 17:20) if len(line) >= 20: resname = line[17:20].strip() - if resname in water_residues: + if resname in WATER_RESIDUES: continue # Check TER records that might reference water elif line.startswith("TER"): if len(line) >= 20: resname = line[17:20].strip() - if resname in water_residues: + if resname in WATER_RESIDUES: continue filtered_lines.append(line) @@ -77,11 +106,9 @@ def _remove_waters_cif(cif_string: str) -> str: from Bio.PDB import MMCIFIO, MMCIFParser, Select - water_residues = {"HOH", "WAT", "SOL", "TIP3", "TIP4", "SPC"} - class WaterRemover(Select): def accept_residue(self, residue): - return residue.get_resname().strip() not in water_residues + return residue.get_resname().strip() not in WATER_RESIDUES # MMCIFParser requires a file path with 
tempfile.NamedTemporaryFile( diff --git a/tests/test_chain_gaps.py b/tests/test_chain_gaps.py index 53b29d7..c5119a2 100644 --- a/tests/test_chain_gaps.py +++ b/tests/test_chain_gaps.py @@ -4,7 +4,6 @@ from graphrelax.chain_gaps import ( ChainGap, - add_ter_records_at_gaps, detect_chain_gaps, get_gap_summary, restore_chain_ids, @@ -302,27 +301,6 @@ def test_roundtrip_preserves_atoms(self, gapped_peptide_pdb): assert orig_atoms == restored_atoms -class TestAddTerRecordsAtGaps: - """Tests for add_ter_records_at_gaps function.""" - - def test_no_ter_without_gaps(self, continuous_peptide_pdb): - """Test no TER records added to continuous peptide.""" - gaps = detect_chain_gaps(continuous_peptide_pdb, check_distance=False) - result = add_ter_records_at_gaps(continuous_peptide_pdb, gaps) - assert result == continuous_peptide_pdb - - def test_adds_ter_at_gap(self, gapped_peptide_pdb): - """Test TER record is added at gap location.""" - gaps = detect_chain_gaps(gapped_peptide_pdb, check_distance=False) - result = add_ter_records_at_gaps(gapped_peptide_pdb, gaps) - - # Count TER records - ter_count = sum( - 1 for line in result.split("\n") if line.startswith("TER") - ) - assert ter_count >= 1 - - class TestGetGapSummary: """Tests for get_gap_summary function.""" From 09e50939d97af917fbd60d1a97b1dfba7345a206 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 11:06:37 -0500 Subject: [PATCH 12/22] Remove requirements.txt and fix integration tests - Remove requirements.txt (dependencies in pyproject.toml) - Update test_relaxer_integration.py to use public API: - Use check_gpu_available() from utils instead of removed method - Use relax() with constrained=False instead of removed _relax_direct() Co-Authored-By: Claude Opus 4.5 --- requirements.txt | 49 ---------------- tests/test_relaxer_integration.py | 92 ++++++++++++++++++------------- 2 files changed, 55 insertions(+), 86 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt 
b/requirements.txt deleted file mode 100644 index 3c80844..0000000 --- a/requirements.txt +++ /dev/null @@ -1,49 +0,0 @@ -# GraphRelax Requirements -# ====================== -# -# This file lists ALL dependencies needed to run GraphRelax with full -# functionality, including ligand support. -# -# INSTALLATION ORDER: -# 1. First install conda-forge packages (not available on PyPI) -# 2. Then install pip packages -# -# Quick install (recommended): -# conda install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit -# pip install -r requirements.txt -# -# Or use the environment.yml file for a complete conda environment. - -# ============================================================================= -# CONDA-FORGE ONLY (must be installed via conda before pip) -# ============================================================================= -# These packages are NOT on PyPI and must be installed via conda-forge: -# -# conda install -c conda-forge \ -# pdbfixer \ -# openmmforcefields>=0.13.0 \ -# openff-toolkit>=0.14.0 \ -# rdkit>=2023.09.1 -# -# pdbfixer: Required for unconstrained minimization and ligand support -# openmmforcefields: Required for --include-ligands (ligand force fields) -# openff-toolkit: Required for --include-ligands (molecule parameterization) -# rdkit: Required for --include-ligands (bond perception from PDB) - -# ============================================================================= -# PIP PACKAGES (install after conda packages) -# ============================================================================= - -# Core dependencies -torch>=2.0 -numpy<2 # PyTorch <2.5 is incompatible with NumPy 2.x -openmm -biopython -prody -absl-py -ml-collections -dm-tree - -# Development/testing (optional) -# pytest>=7.0 -# pytest-cov>=4.1 diff --git a/tests/test_relaxer_integration.py b/tests/test_relaxer_integration.py index d62d92d..fb37596 100644 --- a/tests/test_relaxer_integration.py +++ b/tests/test_relaxer_integration.py @@ -20,29 
+20,44 @@ def relaxer(): return Relaxer(config) +@pytest.fixture +def unconstrained_relaxer(): + """Create a Relaxer instance with unconstrained minimization.""" + from graphrelax.relaxer import Relaxer + + config = RelaxConfig(max_iterations=50, stiffness=10.0, constrained=False) + return Relaxer(config) + + @pytest.mark.integration class TestRelaxerGPUDetection: """Tests for GPU detection logic.""" - def test_check_gpu_available_returns_bool(self, relaxer): + def test_check_gpu_available_returns_bool(self): """Test that GPU check returns a boolean.""" - result = relaxer._check_gpu_available() + from graphrelax.utils import check_gpu_available + + result = check_gpu_available() assert isinstance(result, bool) - def test_check_gpu_available_cached(self, relaxer): + def test_check_gpu_available_cached(self): """Test that GPU check result is cached.""" - result1 = relaxer._check_gpu_available() - result2 = relaxer._check_gpu_available() + from graphrelax.utils import check_gpu_available + + result1 = check_gpu_available() + result2 = check_gpu_available() assert result1 == result2 @pytest.mark.integration -class TestRelaxDirect: - """Tests for direct OpenMM minimization.""" +class TestRelaxUnconstrained: + """Tests for unconstrained OpenMM minimization.""" - def test_relax_direct_runs(self, relaxer, small_peptide_pdb_string): - """Test that direct relaxation completes without error.""" - relaxed_pdb, debug_info, violations = relaxer._relax_direct( + def test_relax_unconstrained_runs( + self, unconstrained_relaxer, small_peptide_pdb_string + ): + """Test that unconstrained relaxation completes without error.""" + relaxed_pdb, debug_info, violations = unconstrained_relaxer.relax( small_peptide_pdb_string ) @@ -50,39 +65,44 @@ def test_relax_direct_runs(self, relaxer, small_peptide_pdb_string): assert isinstance(relaxed_pdb, str) assert len(relaxed_pdb) > 0 - def test_relax_direct_returns_debug_info( - self, relaxer, small_peptide_pdb_string + def 
test_relax_unconstrained_returns_debug_info( + self, unconstrained_relaxer, small_peptide_pdb_string ): """Test that debug info contains expected keys.""" - _, debug_info, _ = relaxer._relax_direct(small_peptide_pdb_string) + _, debug_info, _ = unconstrained_relaxer.relax(small_peptide_pdb_string) assert "initial_energy" in debug_info assert "final_energy" in debug_info assert "rmsd" in debug_info - assert "attempts" in debug_info - def test_relax_direct_energy_types(self, relaxer, small_peptide_pdb_string): + def test_relax_unconstrained_energy_types( + self, unconstrained_relaxer, small_peptide_pdb_string + ): """Test that energy values are numeric.""" - _, debug_info, _ = relaxer._relax_direct(small_peptide_pdb_string) + _, debug_info, _ = unconstrained_relaxer.relax(small_peptide_pdb_string) assert isinstance(debug_info["initial_energy"], (int, float)) assert isinstance(debug_info["final_energy"], (int, float)) assert isinstance(debug_info["rmsd"], (int, float)) - def test_relax_direct_pdb_format(self, relaxer, small_peptide_pdb_string): + def test_relax_unconstrained_pdb_format( + self, unconstrained_relaxer, small_peptide_pdb_string + ): """Test that output is valid PDB format.""" - relaxed_pdb, _, _ = relaxer._relax_direct(small_peptide_pdb_string) + relaxed_pdb, _, _ = unconstrained_relaxer.relax( + small_peptide_pdb_string + ) # Should contain ATOM records assert "ATOM" in relaxed_pdb or "HETATM" in relaxed_pdb - def test_relax_direct_violations_array( - self, relaxer, small_peptide_pdb_string + def test_relax_unconstrained_violations_array( + self, unconstrained_relaxer, small_peptide_pdb_string ): """Test that violations is a numpy array.""" import numpy as np - _, _, violations = relaxer._relax_direct(small_peptide_pdb_string) + _, _, violations = unconstrained_relaxer.relax(small_peptide_pdb_string) assert isinstance(violations, np.ndarray) @@ -135,30 +155,28 @@ def test_get_energy_breakdown_has_total( class TestRelaxerConfig: """Tests for Relaxer with 
different configurations.""" - def test_high_stiffness(self, small_peptide_pdb_string): - """Test relaxation with high stiffness (more restrained).""" + def test_high_stiffness_constrained(self, small_peptide_pdb_string): + """Test constrained relaxation with high stiffness (more restrained).""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=50, stiffness=100.0) + config = RelaxConfig( + max_iterations=50, stiffness=100.0, constrained=True + ) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) # High stiffness should result in small RMSD assert debug_info["rmsd"] < 1.0 # Less than 1 Angstrom - def test_zero_stiffness(self, small_peptide_pdb_string): - """Test relaxation with no restraints.""" + def test_unconstrained_minimization(self, small_peptide_pdb_string): + """Test unconstrained minimization (no restraints).""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=50, stiffness=0.0) + config = RelaxConfig(max_iterations=50, constrained=False) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) assert relaxed_pdb is not None @@ -166,11 +184,11 @@ def test_limited_iterations(self, small_peptide_pdb_string): """Test relaxation with limited iterations.""" from graphrelax.relaxer import Relaxer - config = RelaxConfig(max_iterations=10, stiffness=10.0) + config = RelaxConfig( + max_iterations=10, stiffness=10.0, constrained=False + ) relaxer = Relaxer(config) - relaxed_pdb, debug_info, _ = relaxer._relax_direct( - small_peptide_pdb_string - ) + relaxed_pdb, debug_info, _ = relaxer.relax(small_peptide_pdb_string) assert relaxed_pdb is not None From 916b4c15e6cd4d7282a59258937c5e8b396f77e0 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 
11:09:24 -0500 Subject: [PATCH 13/22] Clarify conda-forge dependencies in README - Make it clear that openmmforcefields, openff-toolkit, and rdkit are conda-forge only (like pdbfixer) - Add ligand support installation command to PyPI section - Reorganize dependencies table to show required vs optional Co-Authored-By: Claude Opus 4.5 --- README.md | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4d4e325..70b3838 100644 --- a/README.md +++ b/README.md @@ -11,14 +11,20 @@ GraphRelax requires pdbfixer, which is only available via conda-forge. We recomm ### From PyPI (Latest Release) ```bash -# First, install pdbfixer via mamba/conda (required) +# First, install conda-forge dependencies (not available on PyPI) mamba install -c conda-forge pdbfixer # Then install graphrelax from PyPI pip install graphrelax ``` -This installs the latest stable release. +This installs the latest stable release with basic functionality. + +**For ligand support** (auto-parameterization of small molecules), also install: + +```bash +mamba install -c conda-forge openmmforcefields openff-toolkit rdkit +``` ### From Source (Latest Development Version) @@ -60,17 +66,20 @@ Core dependencies (installed automatically via pip): - absl-py - ml-collections -Required (must be installed separately via conda): +Conda-forge only (must be installed via mamba/conda, not pip): -- pdbfixer (conda-forge only, not on PyPI) +| Package | Required | Purpose | +| ----------------- | -------- | -------------------------------- | +| pdbfixer | Yes | Structure preparation | +| openmmforcefields | Optional | Small molecule force fields | +| openff-toolkit | Optional | OpenFF Molecule parameterization | +| rdkit | Optional | Bond perception from PDB | -Optional (for ligand support): +The optional packages are needed for **ligand support** (auto-parameterization of small molecules during minimization). 
Install all conda-forge dependencies with: -| Package | Version | Purpose | -| ----------------- | ------------ | --------------------------- | -| openmmforcefields | >= 0.13.0 | Small molecule force fields | -| openff-toolkit | >= 0.14.0 | OpenFF Molecule creation | -| rdkit | >= 2023.09.1 | Bond perception from PDB | +```bash +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit +``` ### Docker From ba18f8e8038daa353cea44ac9f91e4da3fc482df Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 11:10:43 -0500 Subject: [PATCH 14/22] Make all conda-forge dependencies mandatory - pdbfixer, openmmforcefields, openff-toolkit, and rdkit are now all required for installation - Simplified installation instructions - Removed "optional" labeling from dependencies table Co-Authored-By: Claude Opus 4.5 --- README.md | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 70b3838..a386636 100644 --- a/README.md +++ b/README.md @@ -6,32 +6,21 @@ GraphRelax combines **LigandMPNN** (for sequence design and side-chain packing) ## Installation -GraphRelax requires pdbfixer, which is only available via conda-forge. We recommend using mamba for faster installation. +GraphRelax requires several packages that are only available via conda-forge. We recommend using mamba for faster installation. ### From PyPI (Latest Release) ```bash # First, install conda-forge dependencies (not available on PyPI) -mamba install -c conda-forge pdbfixer +mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit # Then install graphrelax from PyPI pip install graphrelax ``` -This installs the latest stable release with basic functionality. 
- -**For ligand support** (auto-parameterization of small molecules), also install: - -```bash -mamba install -c conda-forge openmmforcefields openff-toolkit rdkit -``` - ### From Source (Latest Development Version) ```bash -# First, install pdbfixer via mamba/conda (required) -mamba install -c conda-forge pdbfixer - # Clone the repository git clone https://github.com/delalamo/GraphRelax.git cd GraphRelax @@ -46,8 +35,6 @@ mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit pip install -e . ``` -This installs the latest development version with all recent changes. - LigandMPNN model weights (~40MB) are downloaded automatically on first run. > **Note:** `mamba` is a drop-in replacement for `conda` that's 10-100x faster. Install it with `conda install -n base -c conda-forge mamba`, or use `micromamba` as a standalone tool. @@ -68,14 +55,14 @@ Core dependencies (installed automatically via pip): Conda-forge only (must be installed via mamba/conda, not pip): -| Package | Required | Purpose | -| ----------------- | -------- | -------------------------------- | -| pdbfixer | Yes | Structure preparation | -| openmmforcefields | Optional | Small molecule force fields | -| openff-toolkit | Optional | OpenFF Molecule parameterization | -| rdkit | Optional | Bond perception from PDB | +| Package | Purpose | +| ----------------- | -------------------------------- | +| pdbfixer | Structure preparation | +| openmmforcefields | Small molecule force fields | +| openff-toolkit | OpenFF Molecule parameterization | +| rdkit | Bond perception from PDB | -The optional packages are needed for **ligand support** (auto-parameterization of small molecules during minimization). 
Install all conda-forge dependencies with: +Install all conda-forge dependencies with: ```bash mamba install -c conda-forge pdbfixer openmmforcefields openff-toolkit rdkit @@ -226,8 +213,6 @@ graphrelax -i protein_with_ligand.pdb -o designed.pdb \ graphrelax -i protein_with_ligand.pdb -o designed.pdb --ignore-ligands ``` -**Requires:** `mamba install -c conda-forge openmmforcefields openff-toolkit rdkit` - For design around ligands, use `ligand_mpnn` model type: ```bash From 0d99579a2625bd7cec1334e7047ec6baf40fd0c8 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 12:45:26 -0500 Subject: [PATCH 15/22] Enable idealization by default and add missing residue control - Change idealization from opt-in (--pre-idealize) to opt-out (--no-idealize) - IdealizeConfig.enabled now defaults to True - Add --ignore-missing-residues flag to skip adding residues from SEQRES - Add --overwrite flag to allow overwriting output files - Preserve residue numbering with keepIds=True in PDB output - Update README to reflect new default behavior Co-Authored-By: Claude Opus 4.5 --- README.md | 45 ++++++----- .../LigandMPNN/openfold/np/relax/cleanup.py | 3 + src/graphrelax/cli.py | 79 ++++++++++++++++--- src/graphrelax/config.py | 3 +- src/graphrelax/idealize.py | 2 +- src/graphrelax/relaxer.py | 18 ++++- 6 files changed, 116 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index a386636..38e4c24 100644 --- a/README.md +++ b/README.md @@ -239,31 +239,36 @@ graphrelax -i protein_with_ligand.pdb -o designed.pdb \ **Requires:** `mamba install -c conda-forge pdbfixer` -### Pre-Idealization +### Backbone Idealization -GraphRelax can optionally idealize backbone geometry before processing. This is useful for structures with distorted bond lengths or angles (e.g., from homology modeling or low-resolution experimental data). The idealization step: +By default, GraphRelax idealizes backbone geometry before processing. 
This corrects distorted bond lengths and angles commonly found in experimental structures or homology models. The idealization step: 1. Corrects backbone bond lengths and angles to ideal values 2. Preserves phi/psi/omega dihedral angles -3. Adds missing atoms and optionally missing residues from SEQRES +3. Adds missing atoms 4. Runs constrained minimization to relieve local strain 5. By default, closes chain breaks (gaps) in the structure +**Note:** By default, GraphRelax adds missing residues from SEQRES records during both relaxation and idealization. This may change residue numbering if the input PDB is missing N/C-terminal residues. Use `--ignore-missing-residues` to preserve original numbering for resfile compatibility. + ```bash -# Idealize before relaxation -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize +# Default: idealization is enabled +graphrelax -i input.pdb -o relaxed.pdb -# Idealize but don't add missing residues from SEQRES -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize --ignore-missing-residues +# Skip idealization (use input geometry as-is) +graphrelax -i input.pdb -o relaxed.pdb --no-idealize -# Idealize but keep chain breaks as separate chains (don't close gaps) -graphrelax -i input.pdb -o relaxed.pdb --pre-idealize --retain-chainbreaks +# Don't add missing residues (preserve original numbering for resfiles) +graphrelax -i input.pdb -o relaxed.pdb --ignore-missing-residues + +# Keep chain breaks as separate chains (don't close gaps) +graphrelax -i input.pdb -o relaxed.pdb --retain-chainbreaks # Combine with design -graphrelax -i input.pdb -o designed.pdb --pre-idealize --design +graphrelax -i input.pdb -o designed.pdb --design ``` -**Note:** Pre-idealization requires pdbfixer (`mamba install -c conda-forge pdbfixer`). +**Note:** Idealization requires pdbfixer (`mamba install -c conda-forge pdbfixer`). ### Resfile Format @@ -346,15 +351,16 @@ Input preprocessing: --keep-ligand RES1,RES2,... 
Keep specific ligand residues (comma-separated). Example: --keep-ligand GOL,SO4 - --pre-idealize Idealize backbone geometry before processing. - Corrects bond lengths/angles while preserving - dihedral angles. By default, chain breaks are closed. - Requires pdbfixer. + --no-idealize Skip backbone idealization. By default, backbone + geometry is idealized (bond lengths/angles corrected + while preserving dihedrals). Requires pdbfixer. --ignore-missing-residues Do not add missing residues from SEQRES during - pre-idealization. By default, missing terminal and - internal loop residues are added. - --retain-chainbreaks Do not close chain breaks during pre-idealization. + processing. By default, missing terminal and + internal loop residues are added during relaxation + and idealization. Use this flag to preserve + original PDB residue numbering for resfile compatibility. + --retain-chainbreaks Do not close chain breaks during idealization. By default, chain breaks are closed by treating all segments as a single chain. 
@@ -364,6 +370,7 @@ Scoring: General: -v, --verbose Verbose output --seed N Random seed for reproducibility + --overwrite Overwrite output files if they exist (default: error) ``` ### Scorefile Output @@ -397,7 +404,7 @@ config = PipelineConfig( constrained=False, # Default: unconstrained minimization ), idealize=IdealizeConfig( - enabled=True, # Enable pre-idealization + enabled=True, # Idealization enabled by default add_missing_residues=True, # Add missing residues from SEQRES close_chainbreaks=True, # Close chain breaks (default) ), diff --git a/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py b/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py index 6f062a0..72b69f0 100644 --- a/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py +++ b/src/graphrelax/LigandMPNN/openfold/np/relax/cleanup.py @@ -57,6 +57,9 @@ def fix_pdb(pdbfile, alterations_info): _remove_heterogens(fixer, alterations_info, keep_water=False) fixer.findMissingResidues() alterations_info["missing_residues"] = fixer.missingResidues + # Clear missing residues to preserve original residue numbering + # Missing residues can be added during idealization (on by default) if desired + fixer.missingResidues = {} fixer.findMissingAtoms() alterations_info["missing_heavy_atoms"] = fixer.missingAtoms alterations_info["missing_terminals"] = fixer.missingTerminals diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 2dd0469..532cd35 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -19,6 +19,43 @@ def setup_logging(verbose: bool): ) +def _get_output_paths(output_path: Path, n_outputs: int) -> list: + """ + Get list of all output file paths that will be written.
+ + Args: + output_path: Base output path + n_outputs: Number of outputs to generate + + Returns: + List of Path objects for all output files + """ + if n_outputs == 1: + return [output_path] + + paths = [] + stem = output_path.stem + suffix = output_path.suffix + for i in range(1, n_outputs + 1): + paths.append(output_path.parent / f"{stem}_{i}{suffix}") + return paths + + +def _check_output_exists(output_path: Path, n_outputs: int) -> list: + """ + Check if any output files already exist. + + Args: + output_path: Base output path + n_outputs: Number of outputs to generate + + Returns: + List of existing file paths + """ + paths = _get_output_paths(output_path, n_outputs) + return [p for p in paths if p.exists()] + + def _check_for_ligands(input_path: Path, fmt) -> bool: """ Check if input structure has ligands (non-water HETATM records). @@ -281,28 +318,29 @@ def create_parser() -> argparse.ArgumentParser: ), ) preprocess_group.add_argument( - "--pre-idealize", + "--no-idealize", action="store_true", help=( - "Idealize backbone geometry before processing. " - "Runs constrained minimization to fix local geometry while " - "preserving dihedral angles. By default, chain breaks are closed." + "Skip backbone idealization. By default, backbone geometry is " + "idealized before processing (corrects bond lengths/angles while " + "preserving dihedral angles). Use this flag to skip idealization." ), ) preprocess_group.add_argument( "--ignore-missing-residues", action="store_true", help=( - "Do not add missing residues from SEQRES during pre-idealization. " - "By default, missing N/C-terminal residues and internal loops are " - "added based on SEQRES records." + "Do not add missing residues from SEQRES. By default, missing " + "N/C-terminal residues and internal loops are added. Use this " + "to preserve original PDB residue numbering for resfile " + "compatibility." 
), ) preprocess_group.add_argument( "--retain-chainbreaks", action="store_true", help=( - "Do not close chain breaks during pre-idealization. " + "Do not close chain breaks during idealization. " "By default, chain breaks are closed by treating all segments " "as a single chain. Use this to preserve gaps." ), @@ -322,6 +360,11 @@ def create_parser() -> argparse.ArgumentParser: metavar="N", help="Random seed for reproducibility", ) + general_group.add_argument( + "--overwrite", + action="store_true", + help="Overwrite output files if they exist (default: error if exists)", + ) return parser @@ -343,6 +386,23 @@ def main(args=None) -> int: logger.error(f"Resfile not found: {opts.resfile}") return 1 + # Check if output files already exist (unless --overwrite is set) + if not opts.overwrite: + existing = _check_output_exists(opts.output, opts.n_outputs) + if existing: + if len(existing) == 1: + logger.error( + f"Output file already exists: {existing[0]}. " + "Use --overwrite to replace." + ) + else: + files = ", ".join(str(p) for p in existing) + logger.error( + f"Output files already exist: {files}. " + "Use --overwrite to replace." 
+ ) + return 1 + # Validate input format input_suffix = opts.input.suffix.lower() if input_suffix not in (".pdb", ".cif", ".mmcif"): @@ -426,6 +486,7 @@ def main(args=None) -> int: max_iterations=opts.max_iterations, constrained=opts.constrained_minimization, split_chains_at_gaps=not opts.no_split_gaps, + add_missing_residues=not opts.ignore_missing_residues, ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, @@ -440,7 +501,7 @@ def main(args=None) -> int: keep_residues.add(resname) idealize_config = IdealizeConfig( - enabled=opts.pre_idealize, + enabled=not opts.no_idealize, add_missing_residues=not opts.ignore_missing_residues, close_chainbreaks=not opts.retain_chainbreaks, ) diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 81a126e..6597ca2 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -44,6 +44,7 @@ class RelaxConfig: max_outer_iterations: int = 3 # Violation-fixing iterations constrained: bool = False # Use constrained (AmberRelaxation) minimization split_chains_at_gaps: bool = True # Split chains at gaps to prevent closure + add_missing_residues: bool = True # Add missing residues from SEQRES # GPU is auto-detected and used when available # Ligand support options (ligands are auto-detected) @@ -60,7 +61,7 @@ class RelaxConfig: class IdealizeConfig: """Configuration for structure idealization preprocessing.""" - enabled: bool = False # Idealization disabled by default + enabled: bool = True # Idealization enabled by default fix_cis_omega: bool = True # Correct non-trans peptide bonds (except Pro) post_idealize_stiffness: float = 10.0 # kcal/mol/A^2 for restraint add_missing_residues: bool = True # Add missing residues from SEQRES diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 5c814e9..3ce8336 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -498,7 +498,7 @@ def minimize_with_constraints( state = 
simulation.context.getState(getPositions=True) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) logger.debug("Post-idealization minimization complete") diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index faa1d44..1e3955d 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -217,6 +217,11 @@ def _relax_unconstrained( # Use pdbfixer to add missing atoms and terminal groups fixer = PDBFixer(pdbfile=io.StringIO(pdb_string)) fixer.findMissingResidues() + if not self.config.add_missing_residues: + fixer.missingResidues = {} # Clear to preserve original numbering + elif fixer.missingResidues: + n_missing = sum(len(v) for v in fixer.missingResidues.values()) + logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -263,10 +268,10 @@ def _relax_unconstrained( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - # Write output PDB + # Write output PDB (keepIds=True preserves original residue numbering) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) relaxed_pdb = output.getvalue() @@ -328,6 +333,11 @@ def _relax_unconstrained_with_ligands( # Step 2: Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) fixer.findMissingResidues() + if not self.config.add_missing_residues: + fixer.missingResidues = {} # Clear to preserve original numbering + elif fixer.missingResidues: + n_missing = sum(len(v) for v in fixer.missingResidues.values()) + logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -410,10 +420,10 @@ def _relax_unconstrained_with_ligands( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 
2) / len(posinit)) - # Write output PDB + # Write output PDB (keepIds=True preserves original residue numbering) output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output + simulation.topology, state.getPositions(), output, keepIds=True ) relaxed_pdb = output.getvalue() From 434e11f3cb580e350cf2b8092c5e07382bbbb12e Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:09:43 -0500 Subject: [PATCH 16/22] Fix residue numbering after idealization and add resfile warning - Add renumber_residues_sequential() to idealize.py to ensure sequential residue numbering (1, 2, 3...) per chain after pdbfixer adds missing residues. This fixes false chain gap detection caused by non-sequential numbers from pdbfixer. - Add CLI warning when using resfile with idealization enabled, since residue numbers in the resfile must match the idealized structure - Update README to document the residue renumbering behavior Co-Authored-By: Claude Opus 4.5 --- README.md | 7 ++++- src/graphrelax/cli.py | 9 ++++++ src/graphrelax/idealize.py | 60 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 38e4c24..ddfe753 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,12 @@ By default, GraphRelax idealizes backbone geometry before processing. This corre 4. Runs constrained minimization to relieve local strain 5. By default, closes chain breaks (gaps) in the structure -**Note:** By default, GraphRelax adds missing residues from SEQRES records during both relaxation and idealization. This may change residue numbering if the input PDB is missing N/C-terminal residues. Use `--ignore-missing-residues` to preserve original numbering for resfile compatibility. +**Important:** By default, GraphRelax adds missing residues from SEQRES records during idealization and renumbers all residues sequentially starting from 1 for each chain. 
This ensures consistent numbering regardless of the original PDB numbering scheme. + +If you're using a resfile, the residue numbers must match the **idealized** structure numbering (sequential from 1), not the original PDB numbering. To preserve original numbering for resfile compatibility, use one of these options: + +- `--ignore-missing-residues`: Keep original numbering, don't add missing residues +- `--no-idealize`: Skip idealization entirely (preserves original geometry and numbering) ```bash # Default: idealization is enabled diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index 532cd35..1f22fa1 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -456,6 +456,15 @@ def main(args=None) -> int: else: logger.info("Ligands auto-detected, using openmmforcefields") + # Warn about resfile + idealization interaction + if opts.resfile and not opts.no_idealize: + logger.warning( + "Using resfile with idealization enabled. Residue numbers in " + "the resfile should match the idealized structure (sequential " + "numbering starting from 1). Use --no-idealize or " + "--ignore-missing-residues to preserve original numbering." + ) + logger.info(f"Running GraphRelax in {mode.value} mode") logger.info(f"Input: {opts.input}") logger.info(f"Output: {opts.output}") diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 3ce8336..efcc280 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -615,5 +615,65 @@ def idealize_structure( # Step 7: Restore ligands final_pdb = restore_ligands(minimized_pdb, ligand_lines) + # Step 8: Renumber residues sequentially per chain + # This fixes issues where pdbfixer assigns non-sequential numbers + final_pdb = renumber_residues_sequential(final_pdb) + logger.info("Structure idealization complete") return final_pdb, gaps + + +def renumber_residues_sequential(pdb_string: str) -> str: + """ + Renumber residues sequentially starting from 1 for each chain. 
+ + This fixes issues where pdbfixer assigns non-sequential residue numbers + when adding missing residues. Sequential numbering is required for + proper chain gap detection and LigandMPNN processing. + + HETATM records are renumbered to continue after the last ATOM residue + in each chain. + + Args: + pdb_string: PDB file contents + + Returns: + PDB string with sequential residue numbering + """ + lines = pdb_string.split("\n") + output_lines = [] + + # Track residue numbering per chain + chain_residue_count = {} # chain_id -> next_resnum + residue_map = {} # (chain_id, old_resnum, icode) -> new_resnum + + # First pass: assign new numbers to unique residues + for line in lines: + if line.startswith("ATOM") or line.startswith("HETATM"): + chain_id = line[21] + old_resnum = line[22:26].strip() + icode = line[26] + key = (chain_id, old_resnum, icode) + + if key not in residue_map: + if chain_id not in chain_residue_count: + chain_residue_count[chain_id] = 1 + residue_map[key] = chain_residue_count[chain_id] + chain_residue_count[chain_id] += 1 + + # Second pass: apply new numbers + for line in lines: + if line.startswith("ATOM") or line.startswith("HETATM"): + chain_id = line[21] + old_resnum = line[22:26].strip() + icode = line[26] + key = (chain_id, old_resnum, icode) + + new_resnum = residue_map[key] + # Format residue number right-justified in columns 23-26 + new_line = line[:22] + f"{new_resnum:>4}" + " " + line[27:] + output_lines.append(new_line) + else: + output_lines.append(line) + + return "\n".join(output_lines) From 7368829bcd23b58ab0224bc61aed40c4810cc5ad Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:17:58 -0500 Subject: [PATCH 17/22] Include ligands in unconstrained minimization Fix bug where ligands were extracted before the ligand-presence check, causing the ligand-aware minimization path to never be triggered. 
Now for unconstrained minimization with ligands: - Ligands are detected before any extraction - Full PDB (with ligands) is passed to _relax_unconstrained() - Ligands are parameterized via openmmforcefields and minimized together with the protein For constrained minimization, ligands are still extracted and restored unchanged since AmberRelaxation cannot handle arbitrary ligands. This fixes protein-ligand clashes that occurred when the protein moved during minimization while ligands stayed in their original positions. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/relaxer.py | 50 +++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 1e3955d..898df2c 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -77,9 +77,12 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: If split_chains_at_gaps is enabled, chains will be split at detected gaps before minimization to prevent artificial gap closure. - Ligands (non-water HETATM records) are extracted before relaxation - and restored afterward, since standard AMBER force fields cannot - parameterize arbitrary ligands. + For unconstrained minimization with ligands present, ligands are + parameterized using openmmforcefields and included in the minimization. + + For constrained minimization, ligands are extracted before relaxation + and restored afterward (unchanged) since AmberRelaxation cannot + handle arbitrary ligands. 
Args: pdb_string: PDB file contents as string @@ -87,7 +90,46 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Extract ligands before relaxation (AMBER can't parameterize them) + # Check if ligands are present (non-water HETATM records) + has_ligands = any( + line.startswith("HETATM") + and line[17:20].strip() not in WATER_RESIDUES + for line in pdb_string.split("\n") + ) + + # For unconstrained minimization with ligands, use the ligand-aware path + # which includes ligands in the minimization via openmmforcefields + if not self.config.constrained and has_ligands: + if not self.config.ignore_ligands: + # Detect and handle chain gaps on protein portion only + chain_mapping = {} + if self.config.split_chains_at_gaps: + protein_pdb, _ = extract_ligands(pdb_string) + gaps = detect_chain_gaps(protein_pdb) + if gaps: + logger.info(get_gap_summary(gaps)) + # Split the full PDB (with ligands) at gaps + pdb_string, chain_mapping = split_chains_at_gaps( + pdb_string, gaps + ) + + # Run unconstrained minimization with ligands included + relaxed_pdb, debug_info, violations = self._relax_unconstrained( + pdb_string + ) + + # Restore original chain IDs if chains were split + if chain_mapping: + relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) + debug_info["chains_split"] = True + debug_info["gaps_detected"] = len( + [k for k, v in chain_mapping.items() if k != v] + ) + + return relaxed_pdb, debug_info, violations + + # For constrained minimization or when ignoring ligands: + # Extract ligands, relax protein-only, restore ligands protein_pdb, ligand_lines = extract_ligands(pdb_string) if ligand_lines.strip(): logger.debug( From bad8cfcf87131bed9f5bff44017dbeea6de790b1 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:22:00 -0500 Subject: [PATCH 18/22] Include ligands in idealization minimization Update idealization pipeline to properly handle ligands: 1. 
Add missing residues and atoms (protein only) 2. Idealize bond lengths and angles 3. Minimize protein with constraints (without ligands) 4. Reintroduce ligands and minimize protein+ligand complex together This ensures ligands move with the protein during idealization rather than staying at their original coordinates while the protein moves. New function minimize_with_ligands() uses openmmforcefields to parameterize ligands and perform constrained minimization on the full protein-ligand complex. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 207 +++++++++++++++++++++++++++++++++++-- src/graphrelax/pipeline.py | 5 +- 2 files changed, 200 insertions(+), 12 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index efcc280..80ac97e 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -28,8 +28,22 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig +from graphrelax.ligand_utils import ( + create_openff_molecule, + extract_ligands_from_pdb, + ligand_pdb_to_topology, +) from graphrelax.utils import check_gpu_available +# openmmforcefields is only available via conda-forge +try: + from openmmforcefields.generators import SystemGenerator + + OPENMMFF_AVAILABLE = True +except ImportError: + OPENMMFF_AVAILABLE = False + SystemGenerator = None + # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" if str(LIGANDMPNN_PATH) not in sys.path: @@ -505,6 +519,143 @@ def minimize_with_constraints( return output.getvalue() +def minimize_with_ligands( + pdb_string: str, + stiffness: float = 10.0, + ligand_forcefield: str = "openff-2.0.0", + ligand_smiles: dict = None, +) -> str: + """ + Run constrained minimization with ligand support via openmmforcefields. + + This function is used as a final step in idealization when ligands are + present. It minimizes the protein-ligand complex with position restraints + on all heavy atoms. 
+ + Args: + pdb_string: PDB file contents (protein + ligands) + stiffness: Restraint force constant in kcal/mol/A^2 + ligand_forcefield: Force field for ligands (openff-2.0.0, gaff-2.11) + ligand_smiles: Optional dict mapping resname -> SMILES + + Returns: + Minimized PDB string + + Raises: + ImportError: If openmmforcefields is not available. + """ + if not OPENMMFF_AVAILABLE: + raise ImportError( + "Ligand minimization requires openmmforcefields.\n" + "Install with: conda install -c conda-forge openmmforcefields" + ) + + ENERGY = unit.kilocalories_per_mole + LENGTH = unit.angstroms + + logger.debug( + f"Running constrained minimization with ligands " + f"(stiffness={stiffness}, forcefield={ligand_forcefield})" + ) + + # Separate protein and ligands + protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + ligand_names = [lig.resname for lig in ligands] + logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") + + # Fix protein with pdbfixer (without ligands) + fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) + fixer.findMissingResidues() + fixer.missingResidues = {} # Don't add missing residues at this stage + fixer.findMissingAtoms() + fixer.addMissingAtoms() + + # Create OpenFF molecules for ligands + user_smiles = ligand_smiles or {} + openff_molecules = [] + for ligand in ligands: + smiles = user_smiles.get(ligand.resname) + try: + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) + except Exception as e: + logger.warning( + f"Could not parameterize ligand {ligand.resname}: {e}. " + "Ligand will be excluded from minimization." 
+ ) + continue + + # Create combined topology using Modeller + modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + + # Add ligands back to modeller + for ligand in ligands: + ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) + modeller.add(ligand_topology, ligand_positions) + + # Create SystemGenerator with ligand support + system_generator = SystemGenerator( + forcefields=["amber/ff14SB.xml"], + small_molecule_forcefield=ligand_forcefield, + molecules=openff_molecules, + forcefield_kwargs={ + "constraints": openmm_app.HBonds, + "removeCMMotion": True, + }, + ) + + # Add hydrogens + modeller.addHydrogens(system_generator.forcefield) + + # Create system + system = system_generator.create_system(modeller.topology) + + # Add position restraints to all heavy atoms + restraint_force = openmm.CustomExternalForce( + "0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)" + ) + stiffness_openmm = (stiffness * ENERGY / (LENGTH**2)).value_in_unit( + unit.kilojoules_per_mole / unit.nanometers**2 + ) + restraint_force.addGlobalParameter("k", stiffness_openmm) + for p in ["x0", "y0", "z0"]: + restraint_force.addPerParticleParameter(p) + + for i, atom in enumerate(modeller.topology.atoms()): + if atom.element.name != "hydrogen": + pos = modeller.positions[i].value_in_unit(unit.nanometers) + restraint_force.addParticle(i, pos) + + system.addForce(restraint_force) + logger.debug( + f"Added restraints to {restraint_force.getNumParticles()} heavy atoms" + ) + + # Check for GPU + use_gpu = check_gpu_available() + + # Create simulation and minimize + integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) + platform = Platform.getPlatformByName("CUDA" if use_gpu else "CPU") + simulation = openmm_app.Simulation( + modeller.topology, system, integrator, platform + ) + simulation.context.setPositions(modeller.positions) + + # Minimize + simulation.minimizeEnergy() + + # Get minimized structure + state = simulation.context.getState(getPositions=True) + output = 
io.StringIO() + openmm_app.PDBFile.writeFile( + simulation.topology, state.getPositions(), output, keepIds=True + ) + + logger.debug("Constrained minimization with ligands complete") + return output.getvalue() + + def _idealize_chain_segment( residues: list, config: IdealizeConfig, @@ -548,6 +699,8 @@ def _idealize_chain_segment( def idealize_structure( pdb_string: str, config: IdealizeConfig, + ligand_forcefield: str = "openff-2.0.0", + ligand_smiles: dict = None, ) -> Tuple[str, List[ChainGap]]: """ Idealize backbone geometry while preserving dihedral angles. @@ -559,13 +712,17 @@ def idealize_structure( 2. Detect chain gaps 3. Split chains at gaps 4. Extract dihedrals and optionally correct cis-omega - 5. Run constrained minimization to relieve local strain - 6. Restore original chain IDs - 7. Restore ligands + 5. Add missing residues and atoms (protein only) + 6. Run constrained minimization on protein only + 7. Restore original chain IDs + 8. Reintroduce ligands and minimize protein+ligand complex + 9. 
Renumber residues sequentially Args: pdb_string: Input PDB file contents config: Idealization configuration + ligand_forcefield: Force field for ligand parameterization + ligand_smiles: Optional dict mapping resname -> SMILES Returns: Tuple of (idealized_pdb_string, list_of_chain_gaps) @@ -574,7 +731,8 @@ def idealize_structure( # Step 1: Extract ligands protein_pdb, ligand_lines = extract_ligands(pdb_string) - if ligand_lines.strip(): + has_ligands = bool(ligand_lines.strip()) + if has_ligands: logger.info("Extracted ligands for separate handling") # Step 2: Detect chain gaps (only if we want to retain them) @@ -599,23 +757,50 @@ def idealize_structure( if residues: _idealize_chain_segment(residues, config) - # Step 5: Run constrained minimization - # This is the key step - it fixes local geometry issues while - # keeping the overall structure close to the original + # Step 5-6: Add missing residues/atoms and run constrained minimization + # (protein only - ligands are not included yet) minimized_pdb = minimize_with_constraints( protein_pdb, stiffness=config.post_idealize_stiffness, add_missing_residues=config.add_missing_residues, ) - # Step 6: Restore original chain IDs + # Step 7: Restore original chain IDs if chain_mapping: minimized_pdb = restore_chain_ids(minimized_pdb, chain_mapping) - # Step 7: Restore ligands - final_pdb = restore_ligands(minimized_pdb, ligand_lines) + # Step 8: Reintroduce ligands and minimize protein+ligand complex + if has_ligands: + # Restore ligands to the minimized protein + combined_pdb = restore_ligands(minimized_pdb, ligand_lines) + + # Run a second minimization with ligands included + # This allows protein-ligand interactions to relax together + if OPENMMFF_AVAILABLE: + logger.info("Running final minimization with ligands included") + try: + final_pdb = minimize_with_ligands( + combined_pdb, + stiffness=config.post_idealize_stiffness, + ligand_forcefield=ligand_forcefield, + ligand_smiles=ligand_smiles, + ) + except Exception as 
e: + logger.warning( + f"Ligand minimization failed: {e}. " + "Ligands restored without minimization." + ) + final_pdb = combined_pdb + else: + logger.warning( + "openmmforcefields not available. " + "Ligands restored without minimization." + ) + final_pdb = combined_pdb + else: + final_pdb = minimized_pdb - # Step 8: Renumber residues sequentially per chain + # Step 9: Renumber residues sequentially per chain # This fixes issues where pdbfixer assigns non-sequential numbers final_pdb = renumber_residues_sequential(final_pdb) diff --git a/src/graphrelax/pipeline.py b/src/graphrelax/pipeline.py index f37b5da..7ce3193 100644 --- a/src/graphrelax/pipeline.py +++ b/src/graphrelax/pipeline.py @@ -205,7 +205,10 @@ def _run_single_output( if self.config.idealize.enabled: logger.info("Running pre-idealization...") current_pdb, gaps = idealize_structure( - current_pdb, self.config.idealize + current_pdb, + self.config.idealize, + ligand_forcefield=self.config.relax.ligand_forcefield, + ligand_smiles=self.config.relax.ligand_smiles, ) if gaps: logger.info( From 84d6bdc65cfabcab5c8b40121d2005fe70c9dbd3 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:30:51 -0500 Subject: [PATCH 19/22] Raise error for unparameterizable metallocofactors When ligands containing transition metals (heme, Fe-S clusters, chlorophylls, etc.) are detected, raise a clear error explaining the options instead of attempting to parameterize them. 
- Add UNPARAMETERIZABLE_COFACTORS set in ligand_utils.py - Add is_unparameterizable_cofactor() check function - Update relaxer and idealize to fail early with helpful message Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 22 ++++++++++++++-- src/graphrelax/ligand_utils.py | 47 ++++++++++++++++++++++++++++++++++ src/graphrelax/relaxer.py | 16 ++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 80ac97e..dd7e768 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -31,6 +31,7 @@ from graphrelax.ligand_utils import ( create_openff_molecule, extract_ligands_from_pdb, + is_unparameterizable_cofactor, ligand_pdb_to_topology, ) from graphrelax.utils import check_gpu_available @@ -560,8 +561,24 @@ def minimize_with_ligands( # Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + + # Check for unparameterizable cofactors and fail early + for lig in ligands: + if is_unparameterizable_cofactor(lig.resname): + raise ValueError( + f"Cannot parameterize cofactor '{lig.resname}' " + f"(chain {lig.chain_id}, res {lig.resnum}). " + f"Metallocofactors like heme, Fe-S clusters, and chlorophylls " + f"cannot be parameterized by standard force fields.\n\n" + f"Options:\n" + f" 1. Remove the cofactor from the input PDB\n" + f" 2. Use --ignore-ligands to exclude all ligands\n" + f" 3. 
Use --no-idealize to skip idealization" + ) + ligand_names = [lig.resname for lig in ligands] - logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") + if ligands: + logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") # Fix protein with pdbfixer (without ligands) fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) @@ -651,9 +668,10 @@ def minimize_with_ligands( openmm_app.PDBFile.writeFile( simulation.topology, state.getPositions(), output, keepIds=True ) + result = output.getvalue() logger.debug("Constrained minimization with ligands complete") - return output.getvalue() + return result def _idealize_chain_segment( diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index c774e07..0849d01 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -141,6 +141,53 @@ def get_ion_smiles() -> Dict[str, str]: } +# Cofactors that contain metals or unusual chemistry that cannot be +# parameterized by standard force fields (GAFF, OpenFF, etc.) +# These ligands will be excluded from minimization and restored unchanged. 
+UNPARAMETERIZABLE_COFACTORS = { + # Heme and porphyrins (contain Fe) + "HEM", + "HEC", # Heme C + "HEA", # Heme A + "HEB", # Heme B + "1HE", # Heme variant + "2HE", + "DHE", # Deuteroheme + "HAS", # Heme-AS + "HDD", # Hydroxyheme + "HEO", # Heme O + "HNI", # Heme N + "SRM", # Siroheme + # Iron-sulfur clusters + "SF4", # 4Fe-4S cluster + "FES", # 2Fe-2S cluster + "F3S", # 3Fe-4S cluster + # Other metallocofactors + "CLA", # Chlorophyll A + "CLB", # Chlorophyll B + "BCL", # Bacteriochlorophyll + "BPH", # Bacteriopheophytin + "PHO", # Pheophytin + "CHL", # Chlorophyll + "B12", # Vitamin B12 / Cobalamin + "COB", # Cobalamin + "PQQ", # Pyrroloquinoline quinone + "MTE", # Methanopterin + "F43", # Coenzyme F430 (Ni) + "MO7", # Molybdopterin + "MGD", # Molybdopterin guanine dinucleotide + # Copper centers + "CU1", # Copper site + "CUA", # CuA center + "CUB", # CuB center +} + + +def is_unparameterizable_cofactor(resname: str) -> bool: + """Check if a residue is a known unparameterizable cofactor.""" + return resname.upper() in UNPARAMETERIZABLE_COFACTORS + + def is_single_atom_ligand(ligand: LigandInfo) -> bool: """Check if ligand is a single atom (ion).""" return len(ligand.pdb_lines) == 1 diff --git a/src/graphrelax/relaxer.py b/src/graphrelax/relaxer.py index 898df2c..4067120 100644 --- a/src/graphrelax/relaxer.py +++ b/src/graphrelax/relaxer.py @@ -23,6 +23,7 @@ from graphrelax.ligand_utils import ( create_openff_molecule, extract_ligands_from_pdb, + is_unparameterizable_cofactor, ligand_pdb_to_topology, ) from graphrelax.utils import check_gpu_available @@ -369,6 +370,21 @@ def _relax_unconstrained_with_ligands( # Step 1: Separate protein and ligands protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) + + # Check for unparameterizable cofactors and fail early + for lig in ligands: + if is_unparameterizable_cofactor(lig.resname): + raise ValueError( + f"Cannot parameterize cofactor '{lig.resname}' " + f"(chain {lig.chain_id}, res {lig.resnum}). 
" + f"Metallocofactors like heme, Fe-S clusters, and " + f"chlorophylls cannot be parameterized.\n\n" + f"Options:\n" + f" 1. Remove the cofactor from the input PDB\n" + f" 2. Use --ignore-ligands to exclude all ligands\n" + f" 3. Use --constrained-minimization (protein-only)" + ) + ligand_names = [lig.resname for lig in ligands] logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") From bc5394765dc74bffedadb339e398d8d57e3e3a1b Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:36:31 -0500 Subject: [PATCH 20/22] Remove silent error handling for ligand parameterization Don't silently catch and continue when ligand parameterization fails during idealization - let it fail with a clear error message instead of failing later during relaxation. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index dd7e768..0aab84a 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -793,28 +793,20 @@ def idealize_structure( combined_pdb = restore_ligands(minimized_pdb, ligand_lines) # Run a second minimization with ligands included - # This allows protein-ligand interactions to relax together if OPENMMFF_AVAILABLE: logger.info("Running final minimization with ligands included") - try: - final_pdb = minimize_with_ligands( - combined_pdb, - stiffness=config.post_idealize_stiffness, - ligand_forcefield=ligand_forcefield, - ligand_smiles=ligand_smiles, - ) - except Exception as e: - logger.warning( - f"Ligand minimization failed: {e}. " - "Ligands restored without minimization." - ) - final_pdb = combined_pdb + final_pdb = minimize_with_ligands( + combined_pdb, + stiffness=config.post_idealize_stiffness, + ligand_forcefield=ligand_forcefield, + ligand_smiles=ligand_smiles, + ) else: - logger.warning( - "openmmforcefields not available. " - "Ligands restored without minimization." 
+ raise ImportError( + "Ligand minimization requires openmmforcefields.\n" + "Install with: conda install -c conda-forge openmmforcefields\n" + "Or use --ignore-ligands to exclude ligands." ) - final_pdb = combined_pdb else: final_pdb = minimized_pdb From f586cc73b4028826e62a07d0acb71f9667bc77a3 Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 13:39:54 -0500 Subject: [PATCH 21/22] Don't silently skip ligands that fail parameterization If create_openff_molecule() fails, let the error propagate rather than silently skipping the ligand and then failing later. Co-Authored-By: Claude Opus 4.5 --- src/graphrelax/idealize.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 0aab84a..0da9cf0 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -592,15 +592,8 @@ def minimize_with_ligands( openff_molecules = [] for ligand in ligands: smiles = user_smiles.get(ligand.resname) - try: - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - except Exception as e: - logger.warning( - f"Could not parameterize ligand {ligand.resname}: {e}. " - "Ligand will be excluded from minimization." 
- ) - continue + mol = create_openff_molecule(ligand, smiles=smiles) + openff_molecules.append(mol) # Create combined topology using Modeller modeller = openmm_app.Modeller(fixer.topology, fixer.positions) From 4d8ec60c7d516d5b436a6ef8b11ab2594c5199bb Mon Sep 17 00:00:00 2001 From: delalamo Date: Tue, 13 Jan 2026 15:32:00 -0500 Subject: [PATCH 22/22] Simplify ligand handling: use exclusion zones instead of parameterization - Remove openmmforcefields dependency for ligand handling - Add ligand exclusion zone approach in relaxer.py: ligand atoms are added as massless dummy particles with LJ repulsion, preventing protein from moving into ligand space during minimization - Simplify idealize.py: ligands are extracted before protein minimization and restored afterward at original positions - Fix PyTorch deprecation warning in tensor_utils.py (use tuple for multidimensional indexing) - Add PDBe SMILES fetching for ligand identification (ligand_utils.py) - Remove complex ligand parameterization code that was failing due to PDB files lacking bond information for HETATM records This approach is more robust because: 1. No need for ligand force field parameters 2. Works with any ligand without SMILES 3. 
Protein minimizes fully while respecting ligand positions Co-Authored-By: Claude Opus 4.5 --- .../LigandMPNN/openfold/utils/tensor_utils.py | 2 +- src/graphrelax/cli.py | 10 + src/graphrelax/config.py | 1 + src/graphrelax/idealize.py | 197 +--------- src/graphrelax/ligand_utils.py | 145 ++++++- src/graphrelax/pipeline.py | 2 - src/graphrelax/relaxer.py | 365 ++++++++---------- tests/test_ligand_utils.py | 45 +++ 8 files changed, 357 insertions(+), 410 deletions(-) diff --git a/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py b/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py index ccc7e7a..bb6fc97 100644 --- a/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py +++ b/src/graphrelax/LigandMPNN/openfold/utils/tensor_utils.py @@ -91,7 +91,7 @@ def batched_gather(data, inds, dim=0, no_batch_dims=0): ] remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds ranges.extend(remaining_dims) - return data[ranges] + return data[tuple(ranges)] # With tree_map, a poor man's JAX tree_map diff --git a/src/graphrelax/cli.py b/src/graphrelax/cli.py index fe9ac0e..010b7c2 100644 --- a/src/graphrelax/cli.py +++ b/src/graphrelax/cli.py @@ -282,6 +282,15 @@ def create_parser() -> argparse.ArgumentParser: "Can be used multiple times. Example: --ligand-smiles LIG:SMILES" ), ) + relax_group.add_argument( + "--no-fetch-ligand-smiles", + action="store_true", + help=( + "Disable automatic SMILES lookup from PDBe Chemical Component " + "Dictionary. By default, if RDKit cannot infer bond topology from " + "coordinates, SMILES are fetched from PDBe." 
+ ), + ) # Scoring options score_group = parser.add_argument_group("Scoring options") @@ -499,6 +508,7 @@ def main(args=None) -> int: ignore_ligands=opts.ignore_ligands, ligand_forcefield=opts.ligand_forcefield, ligand_smiles=ligand_smiles, + fetch_pdbe_smiles=not opts.no_fetch_ligand_smiles, ) # Build keep_residues set from --keep-ligand flag (comma-separated) diff --git a/src/graphrelax/config.py b/src/graphrelax/config.py index 6597ca2..2bb3ff3 100644 --- a/src/graphrelax/config.py +++ b/src/graphrelax/config.py @@ -55,6 +55,7 @@ class RelaxConfig: ligand_smiles: Dict[str, str] = field( default_factory=dict ) # {resname: SMILES} + fetch_pdbe_smiles: bool = True # Auto-fetch SMILES from PDBe CCD @dataclass diff --git a/src/graphrelax/idealize.py b/src/graphrelax/idealize.py index 0da9cf0..af7b8c0 100644 --- a/src/graphrelax/idealize.py +++ b/src/graphrelax/idealize.py @@ -28,23 +28,8 @@ split_chains_at_gaps, ) from graphrelax.config import IdealizeConfig -from graphrelax.ligand_utils import ( - create_openff_molecule, - extract_ligands_from_pdb, - is_unparameterizable_cofactor, - ligand_pdb_to_topology, -) from graphrelax.utils import check_gpu_available -# openmmforcefields is only available via conda-forge -try: - from openmmforcefields.generators import SystemGenerator - - OPENMMFF_AVAILABLE = True -except ImportError: - OPENMMFF_AVAILABLE = False - SystemGenerator = None - # Add vendored LigandMPNN to path for OpenFold imports LIGANDMPNN_PATH = Path(__file__).parent / "LigandMPNN" if str(LIGANDMPNN_PATH) not in sys.path: @@ -520,153 +505,6 @@ def minimize_with_constraints( return output.getvalue() -def minimize_with_ligands( - pdb_string: str, - stiffness: float = 10.0, - ligand_forcefield: str = "openff-2.0.0", - ligand_smiles: dict = None, -) -> str: - """ - Run constrained minimization with ligand support via openmmforcefields. - - This function is used as a final step in idealization when ligands are - present. 
It minimizes the protein-ligand complex with position restraints - on all heavy atoms. - - Args: - pdb_string: PDB file contents (protein + ligands) - stiffness: Restraint force constant in kcal/mol/A^2 - ligand_forcefield: Force field for ligands (openff-2.0.0, gaff-2.11) - ligand_smiles: Optional dict mapping resname -> SMILES - - Returns: - Minimized PDB string - - Raises: - ImportError: If openmmforcefields is not available. - """ - if not OPENMMFF_AVAILABLE: - raise ImportError( - "Ligand minimization requires openmmforcefields.\n" - "Install with: conda install -c conda-forge openmmforcefields" - ) - - ENERGY = unit.kilocalories_per_mole - LENGTH = unit.angstroms - - logger.debug( - f"Running constrained minimization with ligands " - f"(stiffness={stiffness}, forcefield={ligand_forcefield})" - ) - - # Separate protein and ligands - protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - - # Check for unparameterizable cofactors and fail early - for lig in ligands: - if is_unparameterizable_cofactor(lig.resname): - raise ValueError( - f"Cannot parameterize cofactor '{lig.resname}' " - f"(chain {lig.chain_id}, res {lig.resnum}). " - f"Metallocofactors like heme, Fe-S clusters, and chlorophylls " - f"cannot be parameterized by standard force fields.\n\n" - f"Options:\n" - f" 1. Remove the cofactor from the input PDB\n" - f" 2. Use --ignore-ligands to exclude all ligands\n" - f" 3. 
Use --no-idealize to skip idealization" - ) - - ligand_names = [lig.resname for lig in ligands] - if ligands: - logger.debug(f"Found {len(ligands)} ligand(s): {ligand_names}") - - # Fix protein with pdbfixer (without ligands) - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) - fixer.findMissingResidues() - fixer.missingResidues = {} # Don't add missing residues at this stage - fixer.findMissingAtoms() - fixer.addMissingAtoms() - - # Create OpenFF molecules for ligands - user_smiles = ligand_smiles or {} - openff_molecules = [] - for ligand in ligands: - smiles = user_smiles.get(ligand.resname) - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - - # Create combined topology using Modeller - modeller = openmm_app.Modeller(fixer.topology, fixer.positions) - - # Add ligands back to modeller - for ligand in ligands: - ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) - modeller.add(ligand_topology, ligand_positions) - - # Create SystemGenerator with ligand support - system_generator = SystemGenerator( - forcefields=["amber/ff14SB.xml"], - small_molecule_forcefield=ligand_forcefield, - molecules=openff_molecules, - forcefield_kwargs={ - "constraints": openmm_app.HBonds, - "removeCMMotion": True, - }, - ) - - # Add hydrogens - modeller.addHydrogens(system_generator.forcefield) - - # Create system - system = system_generator.create_system(modeller.topology) - - # Add position restraints to all heavy atoms - restraint_force = openmm.CustomExternalForce( - "0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)" - ) - stiffness_openmm = (stiffness * ENERGY / (LENGTH**2)).value_in_unit( - unit.kilojoules_per_mole / unit.nanometers**2 - ) - restraint_force.addGlobalParameter("k", stiffness_openmm) - for p in ["x0", "y0", "z0"]: - restraint_force.addPerParticleParameter(p) - - for i, atom in enumerate(modeller.topology.atoms()): - if atom.element.name != "hydrogen": - pos = modeller.positions[i].value_in_unit(unit.nanometers) - 
restraint_force.addParticle(i, pos) - - system.addForce(restraint_force) - logger.debug( - f"Added restraints to {restraint_force.getNumParticles()} heavy atoms" - ) - - # Check for GPU - use_gpu = check_gpu_available() - - # Create simulation and minimize - integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) - platform = Platform.getPlatformByName("CUDA" if use_gpu else "CPU") - simulation = openmm_app.Simulation( - modeller.topology, system, integrator, platform - ) - simulation.context.setPositions(modeller.positions) - - # Minimize - simulation.minimizeEnergy() - - # Get minimized structure - state = simulation.context.getState(getPositions=True) - output = io.StringIO() - openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True - ) - result = output.getvalue() - - logger.debug("Constrained minimization with ligands complete") - return result - - def _idealize_chain_segment( residues: list, config: IdealizeConfig, @@ -710,8 +548,6 @@ def _idealize_chain_segment( def idealize_structure( pdb_string: str, config: IdealizeConfig, - ligand_forcefield: str = "openff-2.0.0", - ligand_smiles: dict = None, ) -> Tuple[str, List[ChainGap]]: """ Idealize backbone geometry while preserving dihedral angles. @@ -726,14 +562,16 @@ def idealize_structure( 5. Add missing residues and atoms (protein only) 6. Run constrained minimization on protein only 7. Restore original chain IDs - 8. Reintroduce ligands and minimize protein+ligand complex + 8. Reintroduce ligands (ligands are kept at original positions) 9. Renumber residues sequentially + Note: Ligands are NOT minimized during idealization. They are extracted + before protein minimization and restored afterward at their original + positions. This avoids the need for ligand force field parameterization. 
+ Args: pdb_string: Input PDB file contents config: Idealization configuration - ligand_forcefield: Force field for ligand parameterization - ligand_smiles: Optional dict mapping resname -> SMILES Returns: Tuple of (idealized_pdb_string, list_of_chain_gaps) @@ -780,26 +618,15 @@ def idealize_structure( if chain_mapping: minimized_pdb = restore_chain_ids(minimized_pdb, chain_mapping) - # Step 8: Reintroduce ligands and minimize protein+ligand complex + # Step 8: Reintroduce ligands if has_ligands: # Restore ligands to the minimized protein - combined_pdb = restore_ligands(minimized_pdb, ligand_lines) - - # Run a second minimization with ligands included - if OPENMMFF_AVAILABLE: - logger.info("Running final minimization with ligands included") - final_pdb = minimize_with_ligands( - combined_pdb, - stiffness=config.post_idealize_stiffness, - ligand_forcefield=ligand_forcefield, - ligand_smiles=ligand_smiles, - ) - else: - raise ImportError( - "Ligand minimization requires openmmforcefields.\n" - "Install with: conda install -c conda-forge openmmforcefields\n" - "Or use --ignore-ligands to exclude ligands." 
- ) + # Ligands are kept at their original positions (not minimized) + # This avoids the need for ligand force field parameterization + logger.info( + "Restoring ligands to minimized protein (ligands held fixed)" + ) + final_pdb = restore_ligands(minimized_pdb, ligand_lines) else: final_pdb = minimized_pdb diff --git a/src/graphrelax/ligand_utils.py b/src/graphrelax/ligand_utils.py index 0849d01..a369fdf 100644 --- a/src/graphrelax/ligand_utils.py +++ b/src/graphrelax/ligand_utils.py @@ -1,7 +1,10 @@ """Utilities for ligand parameterization and handling.""" import io +import json import logging +import urllib.error +import urllib.request from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -188,32 +191,102 @@ def is_unparameterizable_cofactor(resname: str) -> bool: return resname.upper() in UNPARAMETERIZABLE_COFACTORS +# In-memory cache for PDBe SMILES lookups +_PDBE_SMILES_CACHE = {} + + +def fetch_pdbe_smiles(resname: str) -> Optional[str]: + """ + Fetch SMILES from PDBe Chemical Component Dictionary. 
# In-memory cache for PDBe SMILES lookups.  Successful lookups store the
# SMILES string; failed lookups store None so that a component known to be
# absent from the CCD (or unreachable) is not re-queried on every call.
_PDBE_SMILES_CACHE = {}

# Sentinel distinguishing "never looked up" from a cached failure (None).
_CACHE_MISS = object()


def fetch_pdbe_smiles(resname: str) -> Optional[str]:
    """
    Fetch SMILES from PDBe Chemical Component Dictionary.

    Results are memoized in ``_PDBE_SMILES_CACHE`` for the lifetime of the
    process.  Failures are cached as ``None``, so each component code
    triggers at most one network request.

    Args:
        resname: Three-letter ligand code (e.g., "ATP", "3JD")

    Returns:
        SMILES string if found, None otherwise
    """
    resname_upper = resname.upper()

    # Check cache first.  Bug fix: the previous check only returned on a
    # non-None hit, so cached *failures* fell through and re-contacted PDBe
    # on every call.  A miss-sentinel lets None hits short-circuit too.
    cached = _PDBE_SMILES_CACHE.get(resname_upper, _CACHE_MISS)
    if cached is not _CACHE_MISS:
        return cached

    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{resname_upper}"

    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            data = json.loads(response.read().decode("utf-8"))

        # Response shape: {"<HET code>": [{"smiles": [{"name": ...}, ...]}]}
        if resname_upper in data and data[resname_upper]:
            compound_data = data[resname_upper][0]
            if "smiles" in compound_data and compound_data["smiles"]:
                smiles = compound_data["smiles"][0]["name"]
                _PDBE_SMILES_CACHE[resname_upper] = smiles
                logger.info(f"Fetched SMILES for {resname_upper} from PDBe")
                return smiles

    except urllib.error.HTTPError as e:
        if e.code == 404:
            # 404 simply means the code is not in the CCD -- not an error.
            logger.debug(f"Ligand {resname_upper} not found in PDBe CCD")
        else:
            logger.warning(
                f"HTTP error fetching SMILES for {resname_upper}: {e}"
            )
    except urllib.error.URLError as e:
        logger.warning(
            f"Network error fetching SMILES for {resname_upper}: {e}"
        )
    except json.JSONDecodeError as e:
        logger.warning(f"JSON parse error for {resname_upper}: {e}")
    except Exception as e:
        logger.warning(
            f"Unexpected error fetching SMILES for {resname_upper}: {e}"
        )

    # Cache the failure too, so the lookup is not repeated next time.
    _PDBE_SMILES_CACHE[resname_upper] = None
    return None


def is_single_atom_ligand(ligand: "LigandInfo") -> bool:
    """Check if ligand is a single atom (ion), i.e. has one PDB line."""
    return len(ligand.pdb_lines) == 1
From user-provided SMILES (if given) 2. From ion lookup table (for single-atom ligands) - 3. From PDB coordinates via RDKit bond perception - 4. From direct OpenFF PDB parsing + 3. From PDBe Chemical Component Dictionary (if fetch_pdbe=True) + 4. From PDB coordinates via RDKit bond perception (fallback) + + Note: PDBe lookup is preferred over RDKit because RDKit's bond perception + from 3D coordinates often fails or produces incorrect molecules for complex + organic ligands. PDBe has correct SMILES for all standard PDB ligands. Requires openff-toolkit and rdkit (install via conda-forge). Args: ligand: LigandInfo with PDB coordinates smiles: Optional SMILES string (overrides automatic detection) + fetch_pdbe: If True, try PDBe CCD before RDKit (recommended) Returns: openff.toolkit.Molecule Raises: ImportError: If openff-toolkit or rdkit are not installed. + ValueError: If molecule cannot be created by any method. """ _check_ligand_deps() @@ -221,6 +294,7 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): if smiles: try: mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} from SMILES") return mol except Exception as e: @@ -233,6 +307,7 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): mol = Molecule.from_smiles( ion_smiles[ligand.resname], allow_undefined_stereo=True ) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created ion molecule for {ligand.resname}") return mol else: @@ -241,27 +316,59 @@ def create_openff_molecule(ligand: LigandInfo, smiles: Optional[str] = None): f"ligand_smiles={{'{ligand.resname}': '[Element+charge]'}}" ) - # 3. Try RDKit bond perception from PDB coordinates + # 3. 
Try PDBe Chemical Component Dictionary first (most reliable) + if fetch_pdbe: + pdbe_smiles = fetch_pdbe_smiles(ligand.resname) + if pdbe_smiles: + try: + mol = Molecule.from_smiles( + pdbe_smiles, allow_undefined_stereo=True + ) + mol.name = ( + ligand.resname + ) # Required for openmmforcefields matching + logger.info( + f"Created molecule for {ligand.resname} using PDBe SMILES" + ) + return mol + except Exception as e: + logger.warning(f"Failed to create from PDBe SMILES: {e}") + + # 4. Fallback: Try RDKit bond perception from PDB coordinates + # Note: This often produces incorrect molecules for complex organic ligands pdb_block = "\n".join(ligand.pdb_lines) + "\nEND\n" + rdkit_error = None try: mol = _create_molecule_via_rdkit(pdb_block) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} via RDKit") return mol except Exception as e: + rdkit_error = e logger.debug(f"RDKit parsing failed: {e}") - # 4. Fallback: direct OpenFF PDB parsing + # 5. 
Last resort: Try direct OpenFF PDB parsing + openff_error = None try: mol = Molecule.from_pdb_file( io.StringIO(pdb_block), allow_undefined_stereo=True, ) + mol.name = ligand.resname # Required for openmmforcefields matching logger.debug(f"Created molecule for {ligand.resname} from PDB") return mol except Exception as e: - raise ValueError( - f"Could not create molecule for ligand {ligand.resname}: {e}" - ) + openff_error = e + logger.debug(f"OpenFF PDB parsing failed: {e}") + + # All methods failed + raise ValueError( + f"Could not create molecule for ligand {ligand.resname}.\n" + f"RDKit error: {rdkit_error}\n" + f"OpenFF error: {openff_error}\n\n" + f"You can provide SMILES manually via --ligand-smiles " + f"'{ligand.resname}:YOUR_SMILES_HERE'" + ) def _create_molecule_via_rdkit(pdb_block: str): @@ -287,16 +394,34 @@ def _create_molecule_via_rdkit(pdb_block: str): return Molecule.from_rdkit(mol, allow_undefined_stereo=True) -def ligand_pdb_to_topology(ligand: LigandInfo): +def ligand_pdb_to_topology(ligand: LigandInfo, strip_hydrogens: bool = False): """ Convert ligand PDB lines to OpenMM topology and positions. Args: ligand: LigandInfo with PDB lines + strip_hydrogens: If True, remove hydrogen atoms from the topology. + Useful when hydrogens will be added later by addHydrogens(). 
def _is_hydrogen_pdb_line(line: str) -> bool:
    """Return True if a PDB ATOM/HETATM line appears to be a hydrogen.

    Prefers the element field (columns 77-78).  Falls back to the atom-name
    heuristic (name starts with "H") when the element field is absent *or*
    blank -- the previous code consulted the name only for short lines, so a
    full-width line with a blank element field kept its hydrogens.

    NOTE(review): the name heuristic can misread metals whose atom names
    start with "H" (e.g. HG for mercury) when no element field is present
    -- confirm against the ligands this pipeline actually sees.
    """
    if len(line) >= 78:
        element = line[76:78].strip()
        if element:
            return element == "H"
    atom_name = line[12:16].strip() if len(line) >= 16 else ""
    return atom_name.startswith("H")


def ligand_pdb_to_topology(ligand: "LigandInfo", strip_hydrogens: bool = False):
    """
    Convert ligand PDB lines to OpenMM topology and positions.

    Args:
        ligand: LigandInfo with PDB lines
        strip_hydrogens: If True, remove hydrogen atoms from the topology.
            Useful when hydrogens will be added later by addHydrogens().

    Returns:
        Tuple of (topology, positions)
    """
    lines = ligand.pdb_lines
    if strip_hydrogens:
        lines = [ln for ln in lines if not _is_hydrogen_pdb_line(ln)]
    # Terminate the record block so OpenMM's PDB reader stops cleanly.
    pdb_block = "\n".join(lines) + "\nEND\n"

    pdb_file = openmm_app.PDBFile(io.StringIO(pdb_block))
    return pdb_file.topology, pdb_file.positions
ligand_pdb_to_topology, -) -from graphrelax.utils import check_gpu_available - -# openmmforcefields is only available via conda-forge -# Install with: conda install -c conda-forge openmmforcefields=0.13.0 -try: - from openmmforcefields.generators import SystemGenerator - - OPENMMFF_AVAILABLE = True -except ImportError: - OPENMMFF_AVAILABLE = False - SystemGenerator = None - - -def _check_openmmff(): - """Raise ImportError if openmmforcefields is not available.""" - if not OPENMMFF_AVAILABLE: - raise ImportError( - "Ligand support requires openmmforcefields.\n" - "This package is only available via conda-forge:\n\n" - " conda install -c conda-forge openmmforcefields=0.13.0\n\n" - "See the README for full installation instructions." - ) - # Add vendored LigandMPNN to path for OpenFold imports # Must happen before importing from openfold @@ -67,6 +37,22 @@ class Relaxer: def __init__(self, config: RelaxConfig): self.config = config + self._use_gpu: Optional[bool] = None + + def _check_gpu_available(self) -> bool: + """Check if CUDA is available for OpenMM.""" + if self._use_gpu is not None: + return self._use_gpu + + for i in range(Platform.getNumPlatforms()): + if Platform.getPlatform(i).getName() == "CUDA": + self._use_gpu = True + logger.info("OpenMM CUDA platform detected, using GPU") + return True + + self._use_gpu = False + logger.info("OpenMM CUDA not available, using CPU") + return False def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ @@ -78,12 +64,9 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: If split_chains_at_gaps is enabled, chains will be split at detected gaps before minimization to prevent artificial gap closure. - For unconstrained minimization with ligands present, ligands are - parameterized using openmmforcefields and included in the minimization. 
- - For constrained minimization, ligands are extracted before relaxation - and restored afterward (unchanged) since AmberRelaxation cannot - handle arbitrary ligands. + Ligands (non-water HETATM records) are extracted before relaxation + and restored afterward. Protein atoms near ligands are restrained + to prevent clashes when ligands are restored. Args: pdb_string: PDB file contents as string @@ -91,51 +74,15 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Check if ligands are present (non-water HETATM records) - has_ligands = any( - line.startswith("HETATM") - and line[17:20].strip() not in WATER_RESIDUES - for line in pdb_string.split("\n") - ) - - # For unconstrained minimization with ligands, use the ligand-aware path - # which includes ligands in the minimization via openmmforcefields - if not self.config.constrained and has_ligands: - if not self.config.ignore_ligands: - # Detect and handle chain gaps on protein portion only - chain_mapping = {} - if self.config.split_chains_at_gaps: - protein_pdb, _ = extract_ligands(pdb_string) - gaps = detect_chain_gaps(protein_pdb) - if gaps: - logger.info(get_gap_summary(gaps)) - # Split the full PDB (with ligands) at gaps - pdb_string, chain_mapping = split_chains_at_gaps( - pdb_string, gaps - ) - - # Run unconstrained minimization with ligands included - relaxed_pdb, debug_info, violations = self._relax_unconstrained( - pdb_string - ) - - # Restore original chain IDs if chains were split - if chain_mapping: - relaxed_pdb = restore_chain_ids(relaxed_pdb, chain_mapping) - debug_info["chains_split"] = True - debug_info["gaps_detected"] = len( - [k for k, v in chain_mapping.items() if k != v] - ) - - return relaxed_pdb, debug_info, violations - - # For constrained minimization or when ignoring ligands: - # Extract ligands, relax protein-only, restore ligands + # Extract ligands before relaxation (AMBER can't parameterize them) 
protein_pdb, ligand_lines = extract_ligands(pdb_string) + ligand_coords = None if ligand_lines.strip(): logger.debug( "Extracted ligands for separate handling during relaxation" ) + # Parse ligand coordinates to restrain nearby protein atoms + ligand_coords = self._parse_ligand_coords(ligand_lines) # Detect and handle chain gaps if configured chain_mapping = {} @@ -152,7 +99,7 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: relaxed_pdb, debug_info, violations = self.relax_protein(prot) else: relaxed_pdb, debug_info, violations = self._relax_unconstrained( - protein_pdb + protein_pdb, ligand_coords=ligand_coords ) # Restore original chain IDs if chains were split @@ -168,6 +115,20 @@ def relax(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: return relaxed_pdb, debug_info, violations + def _parse_ligand_coords(self, ligand_lines: str) -> np.ndarray: + """Parse ligand atom coordinates from HETATM lines.""" + coords = [] + for line in ligand_lines.split("\n"): + if line.startswith("HETATM"): + try: + x = float(line[30:38]) + y = float(line[38:46]) + z = float(line[46:54]) + coords.append([x, y, z]) + except (ValueError, IndexError): + continue + return np.array(coords) if coords else None + def relax_pdb_file(self, pdb_path: Path) -> Tuple[str, dict, np.ndarray]: """ Relax a PDB file. @@ -192,7 +153,7 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() relaxer = AmberRelaxation( max_iterations=self.config.max_iterations, @@ -219,52 +180,40 @@ def relax_protein(self, prot) -> Tuple[str, dict, np.ndarray]: return relaxed_pdb, debug_data, violations def _relax_unconstrained( - self, pdb_string: str + self, pdb_string: str, ligand_coords: np.ndarray = None ) -> Tuple[str, dict, np.ndarray]: """ Bare-bones unconstrained OpenMM minimization. 
- No position restraints, no violation checking, uses OpenMM defaults. - This is the default minimization mode. - - Ligands are auto-detected. If present and ignore_ligands is False, - uses openmmforcefields for ligand parameterization. + No position restraints on protein, uses OpenMM defaults. + If ligand_coords is provided, adds fixed "dummy" particles at those + positions with LJ repulsion to prevent protein from clashing with + ligand positions. Args: pdb_string: PDB file contents as string (protein-only) + ligand_coords: Optional array of ligand atom positions (Angstroms) Returns: Tuple of (relaxed_pdb_string, debug_info, violations) """ - # Check if ligands are present (non-water HETATM records) - has_ligands = any( - line.startswith("HETATM") - and line[17:20].strip() not in WATER_RESIDUES - for line in pdb_string.split("\n") - ) - - # Auto-detect ligands and use openmmforcefields unless ignore_ligands - if has_ligands and not self.config.ignore_ligands: - return self._relax_unconstrained_with_ligands(pdb_string) - ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() + has_ligand = ligand_coords is not None and len(ligand_coords) > 0 logger.info( f"Running unconstrained OpenMM minimization " - f"(max_iter={self.config.max_iterations}, gpu={use_gpu})" + f"(max_iter={self.config.max_iterations}, gpu={use_gpu}" + f"{', with ligand exclusion zone' if has_ligand else ''})" ) # Use pdbfixer to add missing atoms and terminal groups + from pdbfixer import PDBFixer + fixer = PDBFixer(pdbfile=io.StringIO(pdb_string)) fixer.findMissingResidues() - if not self.config.add_missing_residues: - fixer.missingResidues = {} # Clear to preserve original numbering - elif fixer.missingResidues: - n_missing = sum(len(v) for v in fixer.missingResidues.values()) - logger.info(f"Adding {n_missing} missing residues from SEQRES") fixer.findMissingAtoms() fixer.addMissingAtoms() @@ -282,6 +231,43 @@ def 
_relax_unconstrained( modeller.topology, constraints=openmm_app.HBonds ) + n_protein_atoms = system.getNumParticles() + + # Add ligand atoms as fixed dummy particles with LJ repulsion + ligand_particle_indices = [] + if has_ligand: + # Add a custom nonbonded force for ligand-protein repulsion + # Using soft-core LJ potential to prevent singularities + ligand_repulsion = openmm.CustomNonbondedForce( + "epsilon * (sigma/r)^12; " + "sigma=0.3; epsilon=4.0" # 3 Angstrom radius, 4 kJ/mol + ) + ligand_repulsion.setNonbondedMethod( + openmm.CustomNonbondedForce.CutoffNonPeriodic + ) + ligand_repulsion.setCutoffDistance(1.2 * unit.nanometers) + + # Add all protein atoms to the force + for _ in range(n_protein_atoms): + ligand_repulsion.addParticle([]) + + # Add ligand dummy particles to the system + for _ in ligand_coords: + # Add massless particle (won't move) + idx = system.addParticle(0.0) + ligand_particle_indices.append(idx) + ligand_repulsion.addParticle([]) + + # Set interaction groups: protein interacts with ligand dummies + protein_set = set(range(n_protein_atoms)) + ligand_set = set(ligand_particle_indices) + ligand_repulsion.addInteractionGroup(protein_set, ligand_set) + + system.addForce(ligand_repulsion) + logger.debug( + f"Added {len(ligand_coords)} ligand exclusion particles" + ) + # Create integrator and simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) platform = openmm.Platform.getPlatformByName( @@ -290,7 +276,18 @@ def _relax_unconstrained( simulation = openmm_app.Simulation( modeller.topology, system, integrator, platform ) - simulation.context.setPositions(modeller.positions) + + # Set positions: protein from modeller, ligand dummies from coords + positions = list(modeller.positions) + if has_ligand: + for coord in ligand_coords: + # Convert Angstroms to nanometers + positions.append( + openmm.Vec3(coord[0], coord[1], coord[2]) + * 0.1 + * unit.nanometers + ) + simulation.context.setPositions(positions) # Get initial energy state = 
simulation.context.getState(getEnergy=True, getPositions=True) @@ -308,13 +305,18 @@ def _relax_unconstrained( efinal = state.getPotentialEnergy().value_in_unit(ENERGY) pos = state.getPositions(asNumpy=True).value_in_unit(LENGTH) - # Calculate RMSD - rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) + # Calculate RMSD (protein atoms only) + rmsd = np.sqrt( + np.sum((posinit[:n_protein_atoms] - pos[:n_protein_atoms]) ** 2) + / n_protein_atoms + ) - # Write output PDB (keepIds=True preserves original residue numbering) + # Write output PDB (protein only - exclude dummy ligand particles) output = io.StringIO() + # Get only protein positions + protein_positions = state.getPositions()[:n_protein_atoms] openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True + modeller.topology, protein_positions, output ) relaxed_pdb = output.getvalue() @@ -324,6 +326,8 @@ def _relax_unconstrained( "rmsd": rmsd, "attempts": 1, } + if has_ligand: + debug_data["ligand_exclusion_atoms"] = len(ligand_coords) logger.info( f"Minimization complete: E_init={einit:.2f}, " @@ -335,121 +339,55 @@ def _relax_unconstrained( return relaxed_pdb, debug_data, violations - def _relax_unconstrained_with_ligands( - self, pdb_string: str - ) -> Tuple[str, dict, np.ndarray]: + def _relax_direct(self, pdb_string: str) -> Tuple[str, dict, np.ndarray]: """ - Unconstrained OpenMM minimization with ligand support. + Direct OpenMM minimization without pdbfixer. - Uses openmmforcefields to parameterize ligands with GAFF2/OpenFF. - The protein is processed separately with pdbfixer, then combined - with parameterized ligands for minimization. - - Requires openmmforcefields (install via conda-forge). + This is a simpler approach that works for already-complete structures + (like those from LigandMPNN with packed side chains). 
Args: pdb_string: PDB file contents as string Returns: Tuple of (relaxed_pdb_string, debug_info, violations) - - Raises: - ImportError: If openmmforcefields is not installed. """ - _check_openmmff() - ENERGY = unit.kilocalories_per_mole LENGTH = unit.angstroms - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() logger.info( - f"Running unconstrained minimization with ligands " - f"(forcefield={self.config.ligand_forcefield}, gpu={use_gpu})" + f"Running direct OpenMM minimization " + f"(max_iter={self.config.max_iterations}, " + f"stiffness={self.config.stiffness}, gpu={use_gpu})" ) - # Step 1: Separate protein and ligands - protein_pdb, ligands = extract_ligands_from_pdb(pdb_string) - - # Check for unparameterizable cofactors and fail early - for lig in ligands: - if is_unparameterizable_cofactor(lig.resname): - raise ValueError( - f"Cannot parameterize cofactor '{lig.resname}' " - f"(chain {lig.chain_id}, res {lig.resnum}). " - f"Metallocofactors like heme, Fe-S clusters, and " - f"chlorophylls cannot be parameterized.\n\n" - f"Options:\n" - f" 1. Remove the cofactor from the input PDB\n" - f" 2. Use --ignore-ligands to exclude all ligands\n" - f" 3. 
Use --constrained-minimization (protein-only)" - ) - - ligand_names = [lig.resname for lig in ligands] - logger.info(f"Found {len(ligands)} ligand(s): {ligand_names}") + # Parse PDB + pdb_file = io.StringIO(pdb_string) + pdb = openmm_app.PDBFile(pdb_file) - # Step 2: Fix protein with pdbfixer (without ligands) - fixer = PDBFixer(pdbfile=io.StringIO(protein_pdb)) - fixer.findMissingResidues() - if not self.config.add_missing_residues: - fixer.missingResidues = {} # Clear to preserve original numbering - elif fixer.missingResidues: - n_missing = sum(len(v) for v in fixer.missingResidues.values()) - logger.info(f"Adding {n_missing} missing residues from SEQRES") - fixer.findMissingAtoms() - fixer.addMissingAtoms() - - # Step 3: Create OpenFF molecules for ligands - # User-provided SMILES override automatic detection - user_smiles = self.config.ligand_smiles or {} - - openff_molecules = [] - for ligand in ligands: - smiles = user_smiles.get(ligand.resname) - try: - mol = create_openff_molecule(ligand, smiles=smiles) - openff_molecules.append(mol) - logger.debug(f"Created OpenFF molecule for {ligand.resname}") - except Exception as e: - resname = ligand.resname - raise ValueError( - f"Could not parameterize ligand '{resname}' " - f"(chain {ligand.chain_id}, res {ligand.resnum}): {e}\n\n" - f"Options to resolve:\n" - f" 1. Provide SMILES via config: " - f"ligand_smiles={{'{resname}': 'SMILES_STRING'}}\n" - f" 2. Use constrained minimization: constrained=True\n" - f" 3. 
Remove the ligand from the input PDB\n\n" - f"For common ligands, see: " - f"https://www.ebi.ac.uk/pdbe-srv/pdbechem/" - ) + # Create force field and system + force_field = openmm_app.ForceField( + "amber14-all.xml", "amber14/tip3pfb.xml" + ) - # Step 4: Create combined topology using Modeller - modeller = openmm_app.Modeller(fixer.topology, fixer.positions) + # Use Modeller to add hydrogens (doesn't require pdbfixer) + modeller = openmm_app.Modeller(pdb.topology, pdb.positions) + modeller.addHydrogens(force_field) - # Add ligands back to modeller - for ligand in ligands: - ligand_topology, ligand_positions = ligand_pdb_to_topology(ligand) - modeller.add(ligand_topology, ligand_positions) - - # Step 5: Create SystemGenerator with ligand support - system_generator = SystemGenerator( - forcefields=["amber/ff14SB.xml"], - small_molecule_forcefield=self.config.ligand_forcefield, - molecules=openff_molecules, - forcefield_kwargs={ - "constraints": openmm_app.HBonds, - "removeCMMotion": True, - }, + # Create system with constraints on hydrogen bonds + system = force_field.createSystem( + modeller.topology, constraints=openmm_app.HBonds ) - # Add hydrogens - modeller.addHydrogens(system_generator.forcefield) - - # Create system - system = system_generator.create_system(modeller.topology) + # Add position restraints if stiffness > 0 + if self.config.stiffness > 0: + self._add_restraints( + system, modeller, self.config.stiffness * ENERGY / (LENGTH**2) + ) - # Step 6: Run minimization + # Create integrator and simulation integrator = openmm.LangevinIntegrator(0, 0.01, 0.0) platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" @@ -465,10 +403,13 @@ def _relax_unconstrained_with_ligands( posinit = state.getPositions(asNumpy=True).value_in_unit(LENGTH) # Minimize - if self.config.max_iterations > 0: - simulation.minimizeEnergy(maxIterations=self.config.max_iterations) - else: - simulation.minimizeEnergy() + # OpenMM minimizeEnergy tolerance is in kJ/mol/nm 
(gradient threshold) + tolerance = ( + self.config.tolerance * unit.kilojoules_per_mole / unit.nanometer + ) + simulation.minimizeEnergy( + maxIterations=self.config.max_iterations, tolerance=tolerance + ) # Get final state state = simulation.context.getState(getEnergy=True, getPositions=True) @@ -478,10 +419,10 @@ def _relax_unconstrained_with_ligands( # Calculate RMSD rmsd = np.sqrt(np.sum((posinit - pos) ** 2) / len(posinit)) - # Write output PDB (keepIds=True preserves original residue numbering) + # Write output PDB output = io.StringIO() openmm_app.PDBFile.writeFile( - simulation.topology, state.getPositions(), output, keepIds=True + simulation.topology, state.getPositions(), output ) relaxed_pdb = output.getvalue() @@ -490,16 +431,16 @@ def _relax_unconstrained_with_ligands( "final_energy": efinal, "rmsd": rmsd, "attempts": 1, - "ligands_included": [lig.resname for lig in ligands], - "ligand_forcefield": self.config.ligand_forcefield, } logger.info( - f"Minimization complete: E_init={einit:.2f}, " + f"Relaxation complete: E_init={einit:.2f}, " f"E_final={efinal:.2f}, RMSD={rmsd:.3f} A" ) + # No violations tracking in direct mode violations = np.zeros(0) + return relaxed_pdb, debug_data, violations def _add_restraints(self, system, modeller, stiffness): @@ -551,7 +492,7 @@ def get_energy_breakdown(self, pdb_string: str) -> dict: ) # Create simulation - use_gpu = check_gpu_available() + use_gpu = self._check_gpu_available() platform = openmm.Platform.getPlatformByName( "CUDA" if use_gpu else "CPU" ) diff --git a/tests/test_ligand_utils.py b/tests/test_ligand_utils.py index 5187941..d4c7b9d 100644 --- a/tests/test_ligand_utils.py +++ b/tests/test_ligand_utils.py @@ -1,9 +1,13 @@ """Unit tests for ligand utilities.""" +import pytest + from graphrelax.ligand_utils import ( + _PDBE_SMILES_CACHE, WATER_RESIDUES, LigandInfo, extract_ligands_from_pdb, + fetch_pdbe_smiles, get_ion_smiles, is_single_atom_ligand, ) @@ -181,3 +185,44 @@ def test_tip_models(self): assert 
"TIP3" in WATER_RESIDUES assert "TIP4" in WATER_RESIDUES assert "SPC" in WATER_RESIDUES + + +class TestFetchPdbeSmiles: + """Tests for PDBe SMILES fetching.""" + + def setup_method(self): + """Clear cache before each test.""" + _PDBE_SMILES_CACHE.clear() + + @pytest.mark.network + def test_fetch_known_ligand(self): + """Test fetching SMILES for a known PDB ligand.""" + # ATP is a well-known ligand + smiles = fetch_pdbe_smiles("ATP") + assert smiles is not None + assert len(smiles) > 0 + # Cache should be populated + assert "ATP" in _PDBE_SMILES_CACHE + + @pytest.mark.network + def test_fetch_unknown_ligand(self): + """Test fetching SMILES for a non-existent ligand.""" + smiles = fetch_pdbe_smiles("ZZZZZ") + assert smiles is None + # Failure should also be cached + assert "ZZZZZ" in _PDBE_SMILES_CACHE + assert _PDBE_SMILES_CACHE["ZZZZZ"] is None + + def test_cache_hit(self): + """Test that cached SMILES are returned.""" + # Pre-populate cache + _PDBE_SMILES_CACHE["CACHED"] = "CCC" + smiles = fetch_pdbe_smiles("CACHED") + assert smiles == "CCC" + + def test_case_insensitive(self): + """Test that lookup is case-insensitive.""" + _PDBE_SMILES_CACHE["TEST"] = "C=O" + # Lowercase should find uppercase cache entry + smiles = fetch_pdbe_smiles("test") + assert smiles == "C=O"